準備工作

安裝/載入必要套件

# 請先安裝套件
install.packages("rpart") # 安裝 `reshape2` 套件
install.packages("rpart.plot")
install.packages("RColorBrewer")
install.packages("ggdendro")

# 載入套件
library(rpart) # Recursive partitioning
library(rpart.plot) # Fancy tree plot
library(RColorBrewer) # Nice color palettes
library(ggdendro)

讀取檔案

The Obama-Clinton Divide Data

primary <- read.csv("https://johnsonhsieh.github.io/DSC2016-R/data/primaries.csv")
str(primary) 
'data.frame':   2450 obs. of  50 variables:
 $ fips               : int  1001 1003 1005 1007 1009 1011 1013 1015 1017 1019 ...
 $ county_name        : Factor w/ 1425 levels "Abbeville","Acadia",..: 70 76 84 123 134 176 183 192 228 247 ...
 $ state_postal       : Factor w/ 37 levels "AK","AL","AR",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ region             : Factor w/ 4 levels "MW","NE","S",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ election_date      : Factor w/ 13 levels "1/15/08","1/19/08",..: 9 9 9 9 9 9 9 9 9 9 ...
 $ racetype           : Factor w/ 2 levels "Caucus","Primary": 2 2 2 2 2 2 2 2 2 2 ...
 $ tvotes             : int  4118 12085 3823 1751 3471 2540 2775 11699 3958 2689 ...
 $ clinton            : int  1760 6259 1322 922 2735 471 1143 5942 1448 2040 ...
 $ obama              : int  2268 5450 2393 755 617 2032 1557 5471 2409 485 ...
 $ edwards            : int  57 227 60 44 75 6 40 154 54 105 ...
 $ margin             : num  0.1234 -0.0669 0.2801 -0.0954 -0.6102 ...
 $ winner             : Factor w/ 2 levels "clinton","obama": 2 1 2 1 1 2 2 1 2 1 ...
 $ POP05_SQMI         : num  78.9 97 32.3 33.4 80.7 ...
 $ popUnder30_00      : num  30.3 27.2 31.4 32.8 30.2 31.6 26.4 30.5 28.5 27.8 ...
 $ pop65up_00         : num  10.2 15.5 13.3 11.6 12.9 13.2 16.4 14.1 16.2 15.9 ...
 $ presVote04         : int  20081 69320 10777 7600 21504 4717 8416 45249 13032 9049 ...
 $ kerry04            : num  0.237 0.225 0.448 0.275 0.183 ...
 $ Bush04             : num  0.757 0.764 0.547 0.72 0.809 ...
 $ pres04margin       : num  0.52 0.539 0.099 0.445 0.625 ...
 $ pres04winner       : Factor w/ 2 levels "bush","kerry": 1 1 1 1 1 2 1 1 1 1 ...
 $ pop06              : int  49730 169162 28171 21482 56436 10906 20520 112903 35176 24863 ...
 $ pop00              : int  43911 141423 29044 19939 51229 11613 21330 111392 36574 24059 ...
 $ hisp06             : int  827 4176 953 304 3752 752 171 2399 432 273 ...
 $ white06            : int  39368 145687 13793 16266 51031 2413 11635 85993 21080 22890 ...
 $ black06            : int  8559 16301 13035 4705 872 7615 8511 22178 13311 1358 ...
 $ indian06           : int  203 770 115 63 238 38 46 449 55 103 ...
 $ asian06            : int  271 679 84 25 125 22 82 801 82 59 ...
 $ hawaii06           : int  12 42 6 2 7 1 0 67 0 0 ...
 $ mixed06            : int  490 1507 185 117 411 65 75 1016 216 180 ...
 $ pct_less_30k       : num  0.342 0.358 0.568 0.475 0.415 ...
 $ pct_more_100k      : num  0.0759 0.0949 0.0503 0.0352 0.0501 ...
 $ pct_hs_grad        : num  0.787 0.82 0.646 0.632 0.705 ...
 $ pct_labor_force    : num  0.651 0.598 0.48 0.529 0.606 ...
 $ pct_homeowner      : num  0.808 0.796 0.732 0.802 0.835 ...
 $ unempFeb07         : num  3 3.1 5 3.7 3.1 7.6 5.4 3.7 5.4 3.7 ...
 $ unempFeb08         : num  3.6 3.4 7.1 4.2 3.3 7.3 6.4 3.9 8.5 4.3 ...
 $ unempChg           : num  0.6 0.3 2.1 0.5 0.2 -0.3 1 0.2 3.1 0.6 ...
 $ pctUnins00         : num  12.8 13.8 19.4 16.5 15.7 22.9 19.3 15 15.3 15 ...
 $ subForPctHomes     : num  0.07 0 6.4 0.1 0.04 0 0 0.09 0 0 ...
 $ poverty05          : num  10.4 11.4 22.4 16.6 11.4 38.2 23.1 16.9 16.2 16.2 ...
 $ median_hhi05       : int  45019 42804 29534 34212 40588 21728 28058 35937 31538 33858 ...
 $ Catholic           : num  0.03412 0.07465 0.00913 0 0.00737 ...
 $ So.Bapt.Conv       : num  0.337 0.198 0.261 0.386 0.356 ...
 $ Un.Methodist       : num  0.0757 0.0705 0.0624 0.0237 0.0712 ...
 $ E.L.C.A.           : num  0 0.00107 0 0 0 ...
 $ Construction       : num  6.8 10.56 2.5 20.75 9.75 ...
 $ Manufacturing      : num  17.58 9.23 45.07 15.72 22.76 ...
 $ FinancialActivities: num  5.37 7.92 3.38 3.48 4.3 ...
 $ GoodsProducing     : num  26.8 21.3 51.3 42.6 34.1 ...
 $ ServiceProviding   : num  73.2 78.7 48.7 57.4 65.9 ...

資料整理

primary.sub <- mutate(primary, black06pct=black06/pop06) %>%
  filter(state_postal!="MI", 
         state_postal!="FL", 
         !(state_postal=="WA" & racetype=="Primary")) %>%
  select(county_name, region, winner,
         clinton, obama, pct_hs_grad, black06pct)

head(primary.sub)
  county_name region  winner clinton obama pct_hs_grad black06pct
1     Autauga      S   obama    1760  2268      0.7872 0.17210939
2     Baldwin      S clinton    6259  5450      0.8202 0.09636325
3     Barbour      S   obama    1322  2393      0.6465 0.46270988
4        Bibb      S clinton     922   755      0.6319 0.21902058
5      Blount      S clinton    2735   617      0.7045 0.01545113
6     Bullock      S   obama     471  2032      0.6048 0.69823950

rpart生成決策樹 + rpart.plot::prp畫圖

fit = rpart(winner~region+pct_hs_grad+black06pct,data=primary.sub)
c1 <- ifelse(fit$frame$yval==1, brewer.pal(9, "Greens")[9], brewer.pal(9, "Blues")[9])
c2 <- ifelse(fit$frame$yval==1, brewer.pal(9, "Greens")[2], brewer.pal(9, "Blues")[2])
prp(fit, type=2, extra=1, col=c1, box.col=c2, shadow.col="gray")