Figure 1: Models, Prediction, and Decision-Making
# install any required packages that are missing
packages = c(
  "dplyr","ggplot2","caTools","ROCR","rpart","rpart.plot",
  "caret","doParallel","manipulate")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)])
  install.packages(pkg)
rm(list=ls(all=T))
options(digits=4)
library(dplyr)
library(caTools)
library(rpart)
library(rpart.plot)
library(caret)
library(doParallel)
library(ROCR)
library(manipulate)
load("clv.rdata")
CX = left_join(Y$Y2014, Y$Y2015[,c(1,8,9)], by="cid")
names(CX)[8:11] = c("freq0","revenue0","Retain", "Revenue")
CX$Retain = CX$Retain > 0
table(CX$Retain) %>% prop.table() # 22.99% retained; baseline acc. = 77.01%
##
## FALSE TRUE
## 0.7701 0.2299
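The baseline accuracy above is what we would get by always predicting the majority class (FALSE, i.e. not retained). As a quick sanity check (our addition, not part of the original script), it can be computed directly:
max(prop.table(table(CX$Retain)))   # 0.7701 = baseline accuracy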
str(CX)
## 'data.frame': 16905 obs. of 11 variables:
## $ cid : int 10 80 90 120 130 160 190 220 230 240 ...
## $ recent : num 3464 302 393 1036 2605 ...
## $ freq : int 1 6 10 1 2 2 5 2 1 4 ...
## $ money : num 30 70 116 20 50 ...
## $ senior : num 3464 3386 3418 1036 3345 ...
## $ status : Factor w/ 7 levels "N1","N2","R1",..: 7 3 4 1 7 7 6 6 7 3 ...
## $ since : Date, format: "2005-07-08" "2005-09-24" ...
## $ freq0 : int 0 1 0 0 0 0 0 0 0 1 ...
## $ revenue0: num 0 80 0 0 0 0 0 0 0 20 ...
## $ Retain : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
## $ Revenue : num 0 80 0 0 0 0 0 0 0 0 ...
A model by itself has no accuracy. We use a model to predict a dataset and compare the predictions with the actual values; only then do we know how accurate that model is on that particular dataset.
The accuracy of a predictive model therefore depends on:
predictions on which data?
which kind of accuracy?
To measure a model's accuracy, we usually take the data at hand and first split it into a training set and a testing set ...
set.seed(3000)
spl = sample.split(CX$Retain, SplitRatio = 0.75)  # stratified 75/25 split on Retain
test = subset(CX, !spl)
train = subset(CX, spl)
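As a quick check (our addition, not part of the original script), sample.split is stratified, so both subsets should preserve the roughly 77% / 23% class ratio of the full data:
prop.table(table(train$Retain))   # should be close to 0.7701 / 0.2299
prop.table(table(test$Retain))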
We use the training set to train the model:
m1 = glm(Retain ~ recent+freq+senior+status+freq0+revenue0,
train, family=binomial())
summary(m1)
##
## Call:
## glm(formula = Retain ~ recent + freq + senior + status + freq0 +
## revenue0, family = binomial(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.571 -0.473 -0.295 -0.158 3.288
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.21e+00 1.04e-01 -11.72 < 2e-16 ***
## recent -2.01e-03 1.53e-04 -13.09 < 2e-16 ***
## freq 7.57e-02 2.40e-02 3.16 0.0016 **
## senior 1.57e-04 7.72e-05 2.03 0.0420 *
## statusN2 6.99e-01 8.19e-02 8.54 < 2e-16 ***
## statusR1 3.37e-01 1.43e-01 2.36 0.0184 *
## statusR2 1.18e+00 1.36e-01 8.73 < 2e-16 ***
## statusS1 3.97e-01 1.82e-01 2.18 0.0293 *
## statusS2 1.05e+00 2.52e-01 4.16 3.2e-05 ***
## statusS3 2.29e+00 3.30e-01 6.95 3.6e-12 ***
## freq0 6.28e-01 7.62e-02 8.25 < 2e-16 ***
## revenue0 -1.28e-04 1.48e-04 -0.87 0.3866
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13669.2 on 12677 degrees of freedom
## Residual deviance: 8779.7 on 12666 degrees of freedom
## AIC: 8804
##
## Number of Fisher Scoring iterations: 6
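Because logistic-regression coefficients live on the log-odds scale, exponentiating them gives odds ratios, which are often easier to interpret; this inspection step is our addition, not part of the original script:
exp(coef(m1))   # e.g. exp(0.0757) ≈ 1.08: each extra purchase raises the odds of retention by about 8%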
During training, we can compute the model's accuracy on the training data:
pred1 = predict(m1,type="response")
table(pred1>0.5,train$Retain)
##
## FALSE TRUE
## FALSE 9024 1130
## TRUE 740 1784
cm = table(pred1>0.5,train$Retain) # confusion matrix
sum(diag(cm)) / sum(cm) # TRAIN ACCURACY: 85.25%
## [1] 0.8525
However, training accuracy is generally not what we care about; what really matters is usually the model's accuracy on the testing data:
pred1 = predict(m1,test,type="response")
(cm = table(pred1>0.5,test$Retain))
##
## FALSE TRUE
## FALSE 2999 372
## TRUE 256 600
sum(diag(cm)) / sum(cm) # TEST ACCURACY: 85.14%
## [1] 0.8514
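Since the same confusion-matrix accuracy is computed over and over below, a small helper function (hypothetical, added here only for convenience) captures the pattern:
accuracy = function(prob, actual, cutoff = 0.5) {
  cm = table(prob > cutoff, actual)   # confusion matrix at the given cutoff
  sum(diag(cm)) / sum(cm)             # share of correct predictions
}
accuracy(pred1, test$Retain)          # 0.8514, same as above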
In complex predictive models, accuracy on the training data is easily overestimated. In general, the more complex a model, the better it predicts its own training data; but once a model becomes too complex and fits the sample too closely, it loses its generality, and its accuracy on data outside the sample deteriorates. We can use a decision-tree model to demonstrate this phenomenon of overfitting:
We can adjust the complexity of the decision-tree model by tuning its complexity parameter, cp.
library(rpart)
library(rpart.plot)
m2 = rpart(Retain ~ recent+freq+senior+status+freq0+revenue0,
method='class', train, cp = 5e-2)
prp(m2)
The smaller cp is, the more complex the model becomes ...
m3 = rpart(Retain ~ recent+freq+senior+status+freq0+revenue0,
method='class', train, cp = 5e-3)
prp(m3)
m4 = rpart(Retain ~ recent+freq+senior+status+freq0+revenue0,
method='class', train, cp = 5e-4)
prp(m4)
m5 = rpart(Retain ~ recent+freq+senior+status+freq0+revenue0,
method='class', train, cp = 5e-5)
# prp(m5)   # at cp = 5e-5 the tree is too large to plot legibly
# training and testing accuracy (cutoff 0.5) for each of the four trees
acc = t(sapply(list(m2,m3,m4,m5), function(m) { c(
  train_acc = predict(m)[,2] %>%          # P(Retain) on the training data
    {table(.>0.5, train$Retain)} %>%
    {sum(diag(.)) / sum(.)},
  test_acc = predict(m,test)[,2] %>%      # P(Retain) on the testing data
    {table(.>0.5, test$Retain)} %>%
    {sum(diag(.)) / sum(.)}
) } ) ); acc
## train_acc test_acc
## [1,] 0.8531 0.8429
## [2,] 0.8593 0.8547
## [3,] 0.8779 0.8519
## [4,] 0.8868 0.8349
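The four rows of acc correspond to m2 through m5; labeling them with their cp values (our addition) makes the table easier to read:
rownames(acc) = paste0("cp=", c(5e-2, 5e-3, 5e-4, 5e-5))
acc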
As we gradually raise the complexity, training accuracy and testing accuracy increase together at first; but once the complexity passes a certain point, training accuracy keeps improving while testing accuracy starts to fall.
matplot(acc, type='l', lwd=2, lty=1,
main = "Complexity vs. Accuracy",
xlab="model complexity", ylab="model accuracy")
legend("topleft",c("training_accuracy","testing_accuracy"),
lwd=2, col=1:2)
Therefore, when training a machine-learning model, we need to tune the model's complexity in order to find the best testing accuracy.
In the machine-learning process, we adjust model parameters to change the model's complexity (parameter tuning), so as to find the model with the highest cross-validation accuracy.
The tuning and cross-validation workflow is actually quite involved (see the supervised learning workflow); fortunately, the caret package provides automated tools that run this workflow for us:
library(caret)
library(doParallel)
clust = makeCluster(detectCores())   # one worker per CPU core
registerDoParallel(clust); getDoParWorkers()
## [1] 4
train$Retain = factor(ifelse(train$Retain, "Y", "N"))  # caret's classProbs needs factor levels that are valid names
ctrl = trainControl(
method="repeatedcv", number=10, repeats=10,
savePredictions = "final",
classProbs=TRUE,
summaryFunction=twoClassSummary)
t0 = Sys.time()
set.seed(2)
cv.rpart = train(
Retain ~ recent+freq+senior+status+freq0+revenue0,
data=train, method="rpart",
trControl=ctrl, metric="ROC",
tuneGrid = expand.grid( cp = seq(0.0001,0.0003,0.00002) )
)
cv.rpart; plot(cv.rpart)
## CART
##
## 12678 samples
## 6 predictor
## 2 classes: 'N', 'Y'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 11410, 11410, 11409, 11411, 11411, 11410, ...
## Resampling results across tuning parameters:
##
## cp ROC Sens Spec
## 0.00010 0.8657 0.9198 0.5817
## 0.00012 0.8659 0.9199 0.5824
## 0.00014 0.8663 0.9209 0.5833
## 0.00016 0.8661 0.9210 0.5855
## 0.00018 0.8663 0.9212 0.5863
## 0.00020 0.8671 0.9229 0.5895
## 0.00022 0.8652 0.9232 0.5894
## 0.00024 0.8634 0.9238 0.5903
## 0.00026 0.8620 0.9247 0.5920
## 0.00028 0.8601 0.9250 0.5924
## 0.00030 0.8585 0.9262 0.5929
##
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was cp = 2e-04.
Sys.time() - t0  # elapsed tuning time
## Time difference of 26.31 secs
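Once tuning is finished, it is good practice (our addition, not in the original script) to release the parallel workers:
stopCluster(clust)   # shut down the worker processes
registerDoSEQ()      # restore sequential execution for foreach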
Once the machine-learning workflow has identified the best model parameter (cp = 0.0002), we use it to build our final model:
m3 = rpart(Retain ~ recent+freq+senior+status+freq0+revenue0,
method='class', train, cp = 0.0002)
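rpart also records how much each predictor contributes to the splits; inspecting it (our addition) shows what drives the final tree:
m3$variable.importance   # total importance of each predictor across all splits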
In business data analysis, we adjust the cutoff probability to maximize our expected payoff; the model's prediction (cutoff = 50%) is usually not the same thing as our business decision ...
Figure 2: Distribution of Predicted Probabilities and the Confusion Matrix
When we use a predictive model to make decisions, its discriminating power (AUC) is usually more important than its accuracy (ACC):
pred = predict(m3)[,2]
(cm = table(pred>0.5, train$Retain))
##
## N Y
## FALSE 9225 921
## TRUE 539 1993
sum(diag(cm)) / sum(cm) # TRAIN ACC: .8848
## [1] 0.8848
colAUC(pred,train$Retain) # TRAIN AUC: .8947
## [,1]
## N vs. Y 0.8947
pred = predict(m3, test)[,2]
(cm = table(pred>0.5, test$Retain))
##
## FALSE TRUE
## FALSE 2976 386
## TRUE 279 586
sum(diag(cm)) / sum(cm) # TEST ACC: .8427
## [1] 0.8427
colAUC(pred,test$Retain) # TEST AUC: .8593
## [,1]
## FALSE vs. TRUE 0.8593
library(ROCR)
prediction(pred, test$Retain) %>%        # ROC curve on the testing data
  performance("tpr", "fpr") %>%
  plot(print.cutoffs.at=seq(0,1,0.1))    # mark cutoff values along the curve
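ROCR can also compute the AUC itself; as a cross-check (our addition), it should agree with the colAUC value above:
prediction(pred, test$Retain) %>%
  performance("auc") %>%
  slot("y.values") %>% unlist()   # 0.8593, matching colAUC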
Figure 3: Customer Value Management Workflow
When we use a classification model to make business decisions, different cutoff probabilities produce different confusion matrices;
given a payoff matrix, different confusion matrices produce different expected payoffs:
cutoff = 0.5
cm = table(pred>cutoff, test$Retain)
payoff = matrix(c(0, -10, -30, 100),nrow=2)  # column-major: TN=0, FP=-10, FN=-30, TP=100
sum(cm * payoff)   # expected payoff at this cutoff
## [1] 44230
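To see where 44230 comes from, expand the element-wise product using the confusion-matrix counts shown above (a verification we add here):
# TN*0 + FP*(-10) + FN*(-30) + TP*100
2976*0 + 279*(-10) + 386*(-30) + 586*100   # = 44230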
Given the model and a payoff matrix, we can adjust the cutoff probability to obtain the maximum expected payoff:
payoff = matrix(c(10, -30, -10, 100),nrow=2)  # a different payoff matrix: TN=10, FP=-30, FN=-10, TP=100
dx = t(sapply(seq(.05,.95,.025), function(cx) {   # scan cutoffs from 0.05 to 0.95
  cm = table(pred>cx, test$Retain)
  c(cutoff = cx, payoff = sum(cm * payoff))
}))
i = which.max(dx[,2])                             # cutoff with the highest payoff
plot(dx, type='l',main=sprintf(
  "Optimal (Cutoff, Payoff) = (%.2f, %.0f)", dx[i,1], dx[i,2]))
abline(v=seq(0,1,0.1), h=seq(-300000, 300000, 10000),
col='lightgray', lty=2)
abline(v = dx[i,1], col='green')
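Finally, it is instructive (our addition) to compare the optimal cutoff with the conventional 0.5:
dx[i, ]                           # the payoff-maximizing (cutoff, payoff)
dx[abs(dx[,1] - 0.5) < 1e-9, ]    # the default cutoff, for comparison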