安裝/載入必要套件
# 請先安裝套件
install.packages("rpart") # 安裝 `reshape2` 套件
install.packages("rpart.plot")
install.packages("RColorBrewer")
install.packages("ggdendro")
# 載入套件
library(rpart) # Recursive partitioning
library(rpart.plot) # Fancy tree plot
library(RColorBrewer) # Nice color palettes
library(ggdendro)
The Obama-Clinton Divide Data
primary <- read.csv("https://johnsonhsieh.github.io/DSC2016-R/data/primaries.csv")
str(primary)
'data.frame': 2450 obs. of 50 variables:
$ fips : int 1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
$ county_name : Factor w/ 1425 levels "Abbeville","Acadia",..: 70 76 84 123 134 176 183 192 228 247 ...
$ state_postal : Factor w/ 37 levels "AK","AL","AR",..: 2 2 2 2 2 2 2 2 2 2 ...
$ region : Factor w/ 4 levels "MW","NE","S",..: 3 3 3 3 3 3 3 3 3 3 ...
$ election_date : Factor w/ 13 levels "1/15/08","1/19/08",..: 9 9 9 9 9 9 9 9 9 9 ...
$ racetype : Factor w/ 2 levels "Caucus","Primary": 2 2 2 2 2 2 2 2 2 2 ...
$ tvotes : int 4118 12085 3823 1751 3471 2540 2775 11699 3958 2689 ...
$ clinton : int 1760 6259 1322 922 2735 471 1143 5942 1448 2040 ...
$ obama : int 2268 5450 2393 755 617 2032 1557 5471 2409 485 ...
$ edwards : int 57 227 60 44 75 6 40 154 54 105 ...
$ margin : num 0.1234 -0.0669 0.2801 -0.0954 -0.6102 ...
$ winner : Factor w/ 2 levels "clinton","obama": 2 1 2 1 1 2 2 1 2 1 ...
$ POP05_SQMI : num 78.9 97 32.3 33.4 80.7 ...
$ popUnder30_00 : num 30.3 27.2 31.4 32.8 30.2 31.6 26.4 30.5 28.5 27.8 ...
$ pop65up_00 : num 10.2 15.5 13.3 11.6 12.9 13.2 16.4 14.1 16.2 15.9 ...
$ presVote04 : int 20081 69320 10777 7600 21504 4717 8416 45249 13032 9049 ...
$ kerry04 : num 0.237 0.225 0.448 0.275 0.183 ...
$ Bush04 : num 0.757 0.764 0.547 0.72 0.809 ...
$ pres04margin : num 0.52 0.539 0.099 0.445 0.625 ...
$ pres04winner : Factor w/ 2 levels "bush","kerry": 1 1 1 1 1 2 1 1 1 1 ...
$ pop06 : int 49730 169162 28171 21482 56436 10906 20520 112903 35176 24863 ...
$ pop00 : int 43911 141423 29044 19939 51229 11613 21330 111392 36574 24059 ...
$ hisp06 : int 827 4176 953 304 3752 752 171 2399 432 273 ...
$ white06 : int 39368 145687 13793 16266 51031 2413 11635 85993 21080 22890 ...
$ black06 : int 8559 16301 13035 4705 872 7615 8511 22178 13311 1358 ...
$ indian06 : int 203 770 115 63 238 38 46 449 55 103 ...
$ asian06 : int 271 679 84 25 125 22 82 801 82 59 ...
$ hawaii06 : int 12 42 6 2 7 1 0 67 0 0 ...
$ mixed06 : int 490 1507 185 117 411 65 75 1016 216 180 ...
$ pct_less_30k : num 0.342 0.358 0.568 0.475 0.415 ...
$ pct_more_100k : num 0.0759 0.0949 0.0503 0.0352 0.0501 ...
$ pct_hs_grad : num 0.787 0.82 0.646 0.632 0.705 ...
$ pct_labor_force : num 0.651 0.598 0.48 0.529 0.606 ...
$ pct_homeowner : num 0.808 0.796 0.732 0.802 0.835 ...
$ unempFeb07 : num 3 3.1 5 3.7 3.1 7.6 5.4 3.7 5.4 3.7 ...
$ unempFeb08 : num 3.6 3.4 7.1 4.2 3.3 7.3 6.4 3.9 8.5 4.3 ...
$ unempChg : num 0.6 0.3 2.1 0.5 0.2 -0.3 1 0.2 3.1 0.6 ...
$ pctUnins00 : num 12.8 13.8 19.4 16.5 15.7 22.9 19.3 15 15.3 15 ...
$ subForPctHomes : num 0.07 0 6.4 0.1 0.04 0 0 0.09 0 0 ...
$ poverty05 : num 10.4 11.4 22.4 16.6 11.4 38.2 23.1 16.9 16.2 16.2 ...
$ median_hhi05 : int 45019 42804 29534 34212 40588 21728 28058 35937 31538 33858 ...
$ Catholic : num 0.03412 0.07465 0.00913 0 0.00737 ...
$ So.Bapt.Conv : num 0.337 0.198 0.261 0.386 0.356 ...
$ Un.Methodist : num 0.0757 0.0705 0.0624 0.0237 0.0712 ...
$ E.L.C.A. : num 0 0.00107 0 0 0 ...
$ Construction : num 6.8 10.56 2.5 20.75 9.75 ...
$ Manufacturing : num 17.58 9.23 45.07 15.72 22.76 ...
$ FinancialActivities: num 5.37 7.92 3.38 3.48 4.3 ...
$ GoodsProducing : num 26.8 21.3 51.3 42.6 34.1 ...
$ ServiceProviding : num 73.2 78.7 48.7 57.4 65.9 ...
primary.sub <- mutate(primary, black06pct=black06/pop06) %>%
filter(state_postal!="MI",
state_postal!="FL",
!(state_postal=="WA" & racetype=="Primary")) %>%
select(county_name, region, winner,
clinton, obama, pct_hs_grad, black06pct)
head(primary.sub)
county_name region winner clinton obama pct_hs_grad black06pct
1 Autauga S obama 1760 2268 0.7872 0.17210939
2 Baldwin S clinton 6259 5450 0.8202 0.09636325
3 Barbour S obama 1322 2393 0.6465 0.46270988
4 Bibb S clinton 922 755 0.6319 0.21902058
5 Blount S clinton 2735 617 0.7045 0.01545113
6 Bullock S obama 471 2032 0.6048 0.69823950
rpart
生成決策樹 + rpart.plot::prp
畫圖fit = rpart(winner~region+pct_hs_grad+black06pct,data=primary.sub)
c1 <- ifelse(fit$frame$yval==1, brewer.pal(9, "Greens")[9], brewer.pal(9, "Blues")[9])
c2 <- ifelse(fit$frame$yval==1, brewer.pal(9, "Greens")[2], brewer.pal(9, "Blues")[2])
prp(fit, type=2, extra=1, col=c1, box.col=c2, shadow.col="gray")