Remove data after the demarcation date
X = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 88387
## tid date cust age
## Min. : 1 Min. :2000-11-01 Length:88387 Length:88387
## 1st Qu.:22098 1st Qu.:2000-11-23 Class :character Class :character
## Median :44194 Median :2000-12-12 Mode :character Mode :character
## Mean :44194 Mean :2000-12-15
## 3rd Qu.:66291 3rd Qu.:2001-01-12
## Max. :88387 Max. :2001-01-31
## area items pieces total
## Length:88387 Min. : 1.000 Min. : 1.000 Min. : 5.0
## Class :character 1st Qu.: 2.000 1st Qu.: 3.000 1st Qu.: 230.0
## Mode :character Median : 5.000 Median : 6.000 Median : 522.0
## Mean : 6.994 Mean : 9.453 Mean : 888.7
## 3rd Qu.: 9.000 3rd Qu.: 12.000 3rd Qu.: 1120.0
## Max. :112.000 Max. :339.000 Max. :30171.0
## gross
## Min. :-1645.0
## 1st Qu.: 23.0
## Median : 72.0
## Mean : 138.3
## 3rd Qu.: 174.0
## Max. : 8069.0
## items pieces total gross
## 99.9% 56.0000 84.0000 9378.684 1883.228
## 99.95% 64.0000 98.0000 11261.751 2317.087
## 99.99% 85.6456 137.6456 17699.325 3389.646
d0 = max(X$date) + 1
A = X %>% mutate(
days = as.integer(difftime(d0, date, units="days"))
) %>%
group_by(cust) %>% summarise(
r = min(days), # recency
s = max(days), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = age[1], # age group
area = area[1], # area code
) %>% data.frame # 28584
nrow(A)
## [1] 28584
A$buy
##
## FALSE TRUE
## FALSE 15342 0
## TRUE 0 13242
## cust r s f
## Length:28584 Min. : 1.00 Min. : 1.00 Min. : 1.000
## Class :character 1st Qu.:11.00 1st Qu.:47.00 1st Qu.: 1.000
## Mode :character Median :21.00 Median :68.00 Median : 2.000
## Mean :32.12 Mean :61.27 Mean : 3.089
## 3rd Qu.:53.00 3rd Qu.:83.00 3rd Qu.: 4.000
## Max. :92.00 Max. :92.00 Max. :60.000
##
## m rev raw age
## Min. : 8.0 Min. : 8 Min. : -742.0 Length:28584
## 1st Qu.: 359.4 1st Qu.: 638 1st Qu.: 70.0 Class :character
## Median : 709.5 Median : 1566 Median : 218.0 Mode :character
## Mean : 1012.4 Mean : 2711 Mean : 420.8
## 3rd Qu.: 1315.0 3rd Qu.: 3426 3rd Qu.: 535.0
## Max. :10634.0 Max. :99597 Max. :15565.0
##
## area amount buy
## Length:28584 Min. : 8 Mode :logical
## Class :character 1st Qu.: 454 FALSE:15342
## Mode :character Median : 993 TRUE :13242
## Mean : 1499
## 3rd Qu.: 1955
## Max. :28089
## NA's :15342
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))
## [1] 28584 20008 8576
cbind(A, spl) %>% filter(buy) %>%
ggplot(aes(x=log(amount))) + geom_density(aes(fill=spl), alpha=0.5)
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
## [1] 13242 9269 3973