# R = read_csv("data/olist_order_reviews_dataset.csv")
# L = read_csv("data/olist_marketing_qualified_leads_dataset.csv")
# Q = read_csv("data/olist_closed_deals_dataset.csv")
# I = read_csv("data/olist_order_items_dataset.csv")
# C = read_csv("data/olist_customers_dataset.csv")
# O = read_csv("data/olist_orders_dataset.csv")
# P = read_csv("data/olist_products_dataset.csv")
# S = read_csv("data/olist_sellers_dataset.csv")
# TPC = read_csv("data/product_category_name_translation.csv")
#
# attr(R, "spec") = NULL
# attr(L, "spec") = NULL
# attr(Q, "spec") = NULL
# attr(I, "spec") = NULL
# attr(C, "spec") = NULL
# attr(O, "spec") = NULL
# attr(P, "spec") = NULL
# attr(S, "spec") = NULL
# attr(TPC, "spec") = NULL
#
# save(R,L,Q,I,C,O,P,S,TPC, file="data/olist.rdata")
💡 讀完原始資料之後,先將資料壓縮起來,之後再從壓縮檔讀進會比較快、比較方便
Sellers: S$seller_id
[1] TRUE
Products: P$product_id
[1] TRUE
Reviews: R$review_id
[1] FALSE
Reviews: R$review_id
& R$order_id
[1] TRUE
一則評論最多可能談到三個訂單
.
1 2 3
98371 777 25
Order Items: I$order_id
& I$order_item_id
[1] TRUE
🗿 一筆訂單最多會有多少項目(order_item_id
)呢?
# A tibble: 98,666 x 2
order_id n
<chr> <int>
1 8272b63d03f5f79c56e9e4120aec44ef 21
2 1b15974a0141d54e36626dca3fdc731a 20
3 ab14fdcfbe524636d65ee38360e22ce8 20
4 428a2f660dc84138d969ccd69a0ab6d5 15
5 9ef13efd6949e4573a18964dd1bbe7f5 15
6 73c8ab38f07dc94389065f7eba4f297a 14
7 9bdc4d4c71aa1de4606060929dee888c 14
8 37ee401157a3a0b28c9c6d0ed8c3b24b 13
9 2c2a19b5703863c908512d135aa6accc 12
10 3a213fcdfe7d98be74ea0dc05a8b31ae 12
# ... with 98,656 more rows
🗿 一筆訂單最多會有多少個賣家(seller_id
)呢?
.
1 2 3 4 5
97388 1219 54 3 2
# A tibble: 5 x 2
n nn
<int> <int>
1 1 97388
2 2 1219
3 3 54
4 4 3
5 5 2
🗿 一筆訂單最多會包含多少種產品(product_id
)呢?
.
1 2 3 4 5 6 7 8
95430 2846 298 70 8 10 3 1
💡 做分析之前我們一定要先決定分析對象(Unit of Analysis)
假如我們要對所有的賣家做分析,我們就要依selller_id
做彙總; 先從訂單項目(I
)做起
Z = I %>% group_by(seller_id) %>% summarise(
ItemsSold = n(),
Rev = sum(price),
noProd = n_distinct(product_id),
avgPrice = mean(price),
maxPrice = max(price),
minPrice = min(price),
avgFreight = mean(freight_value),
avgRevProd = Rev/noProd,
avgItemsProd = ItemsSold/noProd
)
summary(Z)
seller_id ItemsSold Rev noProd
Length:3095 Min. : 1.0 Min. : 4 Min. : 1.0
Class :character 1st Qu.: 2.0 1st Qu.: 209 1st Qu.: 2.0
Mode :character Median : 8.0 Median : 821 Median : 4.0
Mean : 36.4 Mean : 4391 Mean : 11.1
3rd Qu.: 24.0 3rd Qu.: 3281 3rd Qu.: 10.0
Max. :2033.0 Max. :229473 Max. :399.0
avgPrice maxPrice minPrice avgFreight
Min. : 3.5 Min. : 3.5 Min. : 0.8 Min. : 1.2
1st Qu.: 52.2 1st Qu.: 81.0 1st Qu.: 21.6 1st Qu.: 14.7
Median : 95.5 Median : 169.9 Median : 44.8 Median : 18.2
Mean : 176.3 Mean : 335.4 Mean : 110.4 Mean : 23.4
3rd Qu.: 174.0 3rd Qu.: 349.9 3rd Qu.: 99.0 3rd Qu.: 24.4
Max. :6729.0 Max. :6735.0 Max. :6729.0 Max. :308.3
avgRevProd avgItemsProd
Min. : 3.5 Min. : 1.00
1st Qu.: 89.0 1st Qu.: 1.00
Median : 179.8 Median : 1.67
Mean : 417.5 Mean : 2.67
3rd Qu.: 397.7 3rd Qu.: 2.73
Max. :16983.5 Max. :128.33
計算每一個賣家的評等時要考慮:
這兩個問題。
X = unique(I[,c(1,4)]) %>% left_join(R[,2:3]) %>%
group_by(seller_id) %>% summarise(
noReview = n(),
avgScore = mean(review_score),
minScore = min(review_score),
maxScore = max(review_score))
Joining, by = "order_id"
seller_id noReview avgScore minScore
Length:3095 Min. : 1.0 Min. :1.00 Min. :1.00
Class :character 1st Qu.: 2.0 1st Qu.:3.75 1st Qu.:1.00
Mode :character Median : 7.0 Median :4.17 Median :1.00
Mean : 32.5 Mean :3.98 Mean :2.25
3rd Qu.: 22.0 3rd Qu.:4.60 3rd Qu.:4.00
Max. :1860.0 Max. :5.00 Max. :5.00
maxScore
Min. :1.00
1st Qu.:5.00
Median :5.00
Mean :4.69
3rd Qu.:5.00
Max. :5.00
最後併入賣家的基本資料
Joining, by = "seller_id"
Joining, by = "seller_id"
seller_id ItemsSold Rev
0 0 0
noProd avgPrice maxPrice
0 0 0
minPrice avgFreight avgRevProd
0 0 0
avgItemsProd noReview avgScore
0 0 0
minScore maxScore seller_zip_code_prefix
0 0 0
seller_city seller_state
0 0
做一個簡單的分析: 賣家的平均評等最高的20個省份是 …
SE RO BA SP RJ MA PR DF MG RS SC
3.8500 3.8542 3.9316 3.9541 3.9657 3.9720 4.0155 4.0319 4.0336 4.0501 4.0725
PI GO PE ES MT MS PB RN PA
4.0833 4.1011 4.1091 4.1959 4.2231 4.2510 4.2812 4.4757 4.5000
Funnel
資料集裡面有更多賣家欄位,但是它的資料筆數不多
Joining, by = "mql_id"
Joining, by = "seller_id"
[1] 380 33
依賣家的行為做彙總
behavior = group_by(Z1, lead_behaviour_profile) %>% summarise(
noSellers = n(),
totalRev = sum(Rev),
avgRev = mean(Rev),
avgReview = mean(noReview),
avgScore = mean(avgScore)
) %>% arrange( desc(avgScore) )
behavior
# A tibble: 7 x 6
lead_behaviour_profile noSellers totalRev avgRev avgReview avgScore
<chr> <int> <dbl> <dbl> <dbl> <dbl>
1 cat, wolf 1 59 59 1 5
2 wolf 43 21378. 497. 4.60 4.39
3 <NA> 91 231802. 2547. 17.0 4.33
4 eagle 50 90725. 1814. 13.5 4.28
5 cat 184 282479. 1535. 9.64 4.25
6 shark 10 49512. 4951. 32.3 4.23
7 eagle, cat 1 896 896 3 3.33
依產業別做彙總
segment = group_by(Z1, business_segment) %>% summarise(
noSellers = n(),
totalRev = sum(Rev),
avgItemsSold = mean(ItemsSold),
avgPrice = totalRev/avgItemsSold,
avgRev = mean(Rev),
avgReview = mean(noReview),
avgScore = mean(avgScore)
) %>% arrange( desc(totalRev) )
segment
# A tibble: 29 x 8
business_segment noSellers totalRev avgItemsSold avgPrice avgRev avgReview
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 watches 3 117279. 198 592. 39093. 193.
2 health_beauty 45 90836. 18.2 4979. 2019. 15.7
3 household_utili~ 44 51569. 12.8 4045. 1172. 11.4
4 audio_video_ele~ 31 50265. 8.87 5666. 1621. 8.03
5 small_appliances 7 47675. 9.86 4837. 6811. 9.71
6 home_decor 44 44102. 10.3 4293. 1002. 8.23
7 pet 17 40499. 16.8 2407. 2382. 14.6
8 construction_to~ 32 34539. 9.94 3476. 1079. 8.5
9 car_accessories 30 30175. 5.47 5520. 1006. 4.93
10 home_appliances 5 26241. 28.2 931. 5248. 27
# ... with 19 more rows, and 1 more variable: avgScore <dbl>
儲存資料物件
🧙 問題討論:
請大家參考一下上學期的期中競賽影片集,
假如老闆給妳這一份資料,讓你透過資料分析提出一些對公司營運有幫助的建議,
請妳用我們已經學過的資料分析技巧先做一些資料探索,比方說:
■ 找出營收貢獻最大的賣家和產品
■ 研究一下哪幾個州(customer_state
)產生的營收最多
■ 各州或者各品類(segment
)的營收貢獻有什麼長期趨勢嗎?
■ 顧客下訂單的時間(order_purchase_timestamp
)有什麼特別的樣態嗎?
■ 我們可以針對顧客滿意(review_score
)做分析嗎?
■ 有哪些因素可能會提高或降低顧客滿意呢?
■ …
做過簡單的資料探索之後,你打算從哪一個方向著手呢?