# Session setup.
# Print with 3 significant digits and bias strongly against scientific notation.
options(digits = 3, scipen = 10)
# Start from an empty workspace (hidden objects included).
rm(list = ls(all.names = TRUE))
# Attach the packages used below (pacman installs any that are missing).
pacman::p_load(dplyr, ggplot2, plotly, arules, arulesViz)
# The saved workspace must supply `Z0` and `A0`: nothing else in this script
# creates them after the workspace has been cleared.
load("data/tf0.rdata")
# IDs of the 10 products with the most rows in Z0 (ascending by row count).
prod10 <- names(tail(sort(table(Z0$prod)), 10))

# For each of those products, fit a linear trend of total quantity against
# average unit price per (prod, age, date), for age groups 'a39' and 'a49'.
Z0 %>%
  filter(age %in% c("a39", "a49"), prod %in% prod10) %>%
  group_by(prod, age, date) %>%
  summarise(
    t.qty = sum(qty),
    # Average unit price of the group: total price over total quantity.
    u.price = sum(price) / t.qty
  ) %>%
  ggplot(aes(x = u.price, y = t.qty, colour = age)) +
  geom_smooth(method = "lm", se = FALSE) +
  # Free scales: each product gets its own price and quantity ranges.
  facet_wrap(~prod, scales = "free") +
  theme_bw()
## `summarise()` has grouped output by 'prod', 'age'. You can override using the `.groups` argument.
## `geom_smooth()` using formula 'y ~ x'

# Number of distinct values in each of the cust, cat and prod columns of Z0.
summarise(Z0, across(c(cust, cat, prod), n_distinct))
##    cust  cat  prod
## 1 32256 2007 23789
# Sparse customer (rows) x product (columns) cross-tabulation of Z0.
mx <- xtabs(~ cust + prod, data = Z0, sparse = TRUE)
# Collapse the stored counts to 1 so each cell only records "bought at all".
mx@x[] <- 1
# Put the products with the most distinct buyers first.
mx <- mx[, order(-colSums(mx))]
# How many products have more than 200 distinct buyers?
sum(colSums(mx) > 200)
## [1] 634
# Keep only customers present in A0, then check that the rows of mx line up
# one-to-one, in order, with the rows of A0.
mx <- mx[rownames(mx) %in% A0$cust, ]
identical(rownames(mx), A0$cust)
## [1] TRUE
# Profile of the N products with the most buyers (the columns of mx are
# already sorted by decreasing buyer count). One row per product.
# Relies on the rows of mx being aligned with the rows of A0, so a column of
# mx can be used directly as a row mask on A0.
N <- 100
px <- data.frame(
  # Product IDs are the COLUMN names of mx; its row names are customer IDs,
  # which would mislabel every product in this table.
  pid = colnames(mx)[1:N],
  # Number of customers who bought the product (entries of mx are 0/1).
  size = colSums(mx)[1:N],
  # Mean of A0$rev and A0$raw over the customers who bought the product.
  rev = apply(mx[, 1:N], 2, function(v) mean(A0$rev[v > 0])),
  raw = apply(mx[, 1:N], 2, function(v) mean(A0$raw[v > 0])),
  # Pooled ratio over the product's buyers: sum of raw divided by sum of rev.
  margin = apply(mx[, 1:N], 2, function(v) sum(A0$raw[v > 0]) / sum(A0$rev[v > 0]))
)
summary(px)
##      pid                 size           rev            raw      
##  Length:100         Min.   : 592   Min.   :4992   Min.   : 677  
##  Class :character   1st Qu.: 708   1st Qu.:6260   1st Qu.: 895  
##  Mode  :character   Median : 819   Median :6858   Median :1023  
##                     Mean   : 999   Mean   :6903   Mean   :1027  
##                     3rd Qu.:1026   3rd Qu.:7502   3rd Qu.:1141  
##                     Max.   :6025   Max.   :9159   Max.   :1486  
##      margin     
##  Min.   :0.127  
##  1st Qu.:0.141  
##  Median :0.149  
##  Mean   :0.148  
##  3rd Qu.:0.155  
##  Max.   :0.171
# Bubble chart of the product profile: x = rev, y = margin, bubble area mapped
# to size. The `text` aesthetic carries pid so that the interactive plotly
# version can show it on hover.
p <- ggplot(px, aes(x = rev, y = margin, text = pid)) +
  geom_point(aes(size = size), alpha = 0.4, colour = "brown") +
  theme_bw()
ggplotly(p)
# Unit-price summary for the 500 products with the most buyers, one row per
# product, most rows in Z0 first; opened in the interactive data viewer.
Z0 %>%
  filter(prod %in% colnames(mx)[1:500]) %>%
  # Unit price of each row: price divided by quantity.
  mutate(uprice = price / qty) %>%
  group_by(prod) %>%
  summarise(
    noPrice = n_distinct(uprice),       # number of distinct unit prices
    maxPrice = max(uprice),
    minPrice = min(uprice),
    avgPrice = sum(price) / sum(qty),   # quantity-weighted average unit price
    totalQty = sum(qty),
    noOrders = n()                      # number of rows for the product
  ) %>%
  arrange(desc(noOrders)) %>%
  View()
# Shrink text so the rotated price labels fit under the bars.
par(cex = 0.6)
# Distribution of unit prices for one product: one bar per distinct unit
# price, bar height = number of rows sold at that price.
Z0 %>%
  filter(prod == "4714981010038") %>%
  mutate(uprice = price / qty) %>%
  count(uprice) %>%
  with(barplot(n, names.arg = round(uprice, 2), las = 2))

# Per-customer margin: raw divided by rev, stored as a new column of A0.
A0[["margin"]] <- A0$raw / A0$rev
summary(A0$margin)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -1.083   0.097   0.155   0.130   0.199   0.585
# Overall distribution of customer margins.
hist(A0$margin)

# Mean customer margin within each age group.
barplot(tapply(A0$margin, A0$age, mean), las = 2)

# Percentage of customers in each (negative margin) x (above-average rev) cell.
round(
  100 * prop.table(
    table(loss = A0$margin < 0, large = A0$rev > mean(A0$rev))
  ),
  1
)
##        large
## loss    FALSE TRUE
##   FALSE  60.3 31.5
##   TRUE    7.8  0.3
# Density of rev (log10 x-axis), split by whether the customer's margin is
# negative.
A0 %>%
  mutate(Loss = margin < 0) %>%
  ggplot(aes(x = rev, fill = Loss)) +
  geom_density(alpha = 0.3) +
  scale_x_log10() +
  theme_bw()