影片連結:(ctrl + click)
# Load article metadata (*_articleMetaData.csv) and comment data
# (*_articleMetaData_response.csv) for each candidate.
li     <- fread("data/賴清德_articleMetaData.csv", encoding = "UTF-8")
li_r   <- fread("data/賴清德_articleMetaData_response.csv", encoding = "UTF-8")
tsai   <- fread("data/蔡英文_articleMetaData.csv", encoding = "UTF-8")
tsai_r <- fread("data/蔡英文_articleMetaData_response.csv", encoding = "UTF-8")
# Only the response file is read here for this candidate, and with read_csv().
zhu1   <- read_csv("data/朱立倫_articleMetaData_response.csv")
kp     <- fread("data/柯文哲_articleMetaData.csv", encoding = "UTF-8")
kp_r   <- fread("data/柯文哲_articleMetaData_response.csv", encoding = "UTF-8")
kuo    <- fread("data/郭台銘_articleMetaData.csv", encoding = "UTF-8")
kuo_r  <- fread("data/郭台銘_articleMetaData_response.csv", encoding = "UTF-8")
# Keep only the comment columns that are needed (selected by position).
# NOTE(review): positions 4, 7, 8, 10 are assumed to hold the article URL,
# commenter id, push/boo status and comment text - confirm against the CSV.
li_r <- li_r[, c(4, 7, 8, 10)]
# The last name previously carried a leading space (" cmtContent").
colnames(li_r) <- c("artUrl", "cmtPoster", "cmtStatus", "cmtContent")
# Number of posters: 1830
# Inner-join comments to their articles on artUrl.
li_all <- merge(x = li, y = li_r, by = "artUrl")
# Edge list: cmtPoster (commenter), artPoster (article author), artUrl.
link <- li_all %>%
  dplyr::select(cmtPoster, artPoster, artUrl)
# Column order matters: graph_from_data_frame() reads column 1 as "from" and
# column 2 as "to"; remaining columns become edge attributes.
# Build the directed network.
net <- graph_from_data_frame(d = link, directed = TRUE)
因網路評論人數眾多,我們設定發文數與回文數須達一定數量才會列入圖中。篩選發文數可以看出該發文者是否高度關注該參選人並熱衷於分享;篩選回文數則可以看出該發文者的文章是否能引起一定的共鳴。
## 6009 272
# Posting frequency per author, ascending.
# as.data.frame() on a table yields columns Var1/Freq, so name them explicitly
# to match the `freq` / `artPoster` references below.
li_poster <- as.data.frame(sort(table(li$artPoster)), stringsAsFactors = FALSE)
colnames(li_poster) <- c("artPoster", "freq")
# Keep authors with at least 5 posts.
li_poster <- li_poster %>% filter(freq >= 5)
link <- li_all %>%
  filter(commentNum >= 500) %>% # articles with at least 500 comments
  # Membership test: `==` would recycle li_poster$artPoster element-wise
  # against the rows instead of checking whether the author is in the set.
  filter(artPoster %in% li_poster$artPoster) %>%
  # filter(cmtStatus != "→") %>% # PTT: keep only push/boo comments (disabled)
  dplyr::select(cmtPoster, artPoster, artUrl, cmtStatus)
# Monthly posting counts for "youhow0418".
# Parse the article date, then label each post with the first day of its month.
li_leader2 <- li_leader2 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, tagged with the poster id.
li_leader2time <- mutate(
  summarise(group_by(li_leader2, months), num = n()),
  poster = "youhow0418"
)
# All articles posted by "luke7212".
li_leader3 <- filter(li, artPoster == "luke7212")
## artTitle artDate artTime
## Length:16 Length:16 Length:16
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## artUrl artPoster artCat commentNum
## Length:16 Length:16 Length:16 Min. : 24.0
## Class :character Class :character Class :character 1st Qu.: 56.5
## Mode :character Mode :character Mode :character Median : 169.0
## Mean : 359.1
## 3rd Qu.: 645.8
## Max. :1056.0
## push boo sentence
## Min. : 11.00 Min. : 1.00 Length:16
## 1st Qu.: 19.75 1st Qu.: 14.75 Class :character
## Median :101.00 Median : 27.50 Mode :character
## Mean :226.81 Mean : 60.75
## 3rd Qu.:412.25 3rd Qu.: 73.25
## Max. :711.00 Max. :335.00
# Monthly posting counts for "luke7212".
# Parse the article date, then label each post with the first day of its month.
li_leader3 <- li_leader3 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, tagged with the poster id.
li_leader3time <- mutate(
  summarise(group_by(li_leader3, months), num = n()),
  poster = "luke7212"
)
# Stack the three leaders' monthly counts and draw one bar panel per poster
# (shared axes across panels).
li_leader <- rbind(li_leader1time, li_leader2time, li_leader3time)
ggplot(li_leader, aes(x = months, y = num, fill = poster)) +
  geom_bar(stat = "identity") +
  facet_wrap(~poster, ncol = 2, scales = "fixed")
# Leader no. 1
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(li_leader1$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(li_leader1$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict.txt",
  stop_word = "dict/stop_word.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Leader no. 3
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(li_leader3$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(li_leader3$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict.txt",
  stop_word = "dict/stop_word.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
## [1] TRUE
# Bigram tokenizer for unnest_tokens(): segments each text with the existing
# jieba_tokenizer worker and joins each pair of adjacent tokens with a space.
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    bigram <- ngrams(unlist(tokens), 2)
    # vapply() always yields a character vector, even when there are no bigrams.
    vapply(bigram, paste, character(1), collapse = " ")
  })
}
# Restrict to the three opinion leaders and split their posts into bigrams.
li_3 <- li %>%
  filter(artPoster %in% c("shared", "youhow0418", "luke7212"))
devotion_bigram <- li_3 %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)
shared 文章攻擊性較強,較常出現情緒性字眼,以爭議性[新聞]、[爆卦]為主,大比例分享民調動態,也因發文量大,情緒分布較為密集,單篇正負情緒也較極端。
youhow0418 和shared相似,分享內容情緒強烈,特別是抨擊蔡英文施政的部分,也因多為爭議性新聞而正負面情緒都相當高。
luke7212 發文量少,情緒分布較為分散,內容以新聞或臉書上正向肯定的貼文居多,偶爾出現負面情緒字眼,整體以正面情緒為主。
# Keep only the comment columns that are needed (selected by position).
# NOTE(review): positions 4, 7, 8, 10 are assumed to hold the article URL,
# commenter id, push/boo status and comment text - confirm against the CSV.
tsai_r <- tsai_r[, c(4, 7, 8, 10)]
# The last name previously carried a leading space (" cmtContent").
colnames(tsai_r) <- c("artUrl", "cmtPoster", "cmtStatus", "cmtContent")
# Number of posters: 1590
# Inner-join comments to their articles on artUrl.
tsai_all <- merge(x = tsai, y = tsai_r, by = "artUrl")
# Edge list: cmtPoster (commenter), artPoster (article author), artUrl.
link <- tsai_all %>%
  dplyr::select(cmtPoster, artPoster, artUrl)
# Column order matters: graph_from_data_frame() reads column 1 as "from" and
# column 2 as "to"; remaining columns become edge attributes.
# Build the directed network.
net <- graph_from_data_frame(d = link, directed = TRUE)
## 4546 250
# Posting frequency per author, ascending.
# as.data.frame() on a table yields columns Var1/Freq, so name them explicitly
# to match the `freq` / `artPoster` references below.
tsai_poster <- as.data.frame(sort(table(tsai$artPoster)), stringsAsFactors = FALSE)
colnames(tsai_poster) <- c("artPoster", "freq")
# Keep authors with at least 5 posts.
tsai_poster <- tsai_poster %>% filter(freq >= 5)
link <- tsai_all %>%
  filter(commentNum >= 500) %>% # articles with at least 500 comments
  # Membership test: `==` would recycle tsai_poster$artPoster element-wise
  # against the rows instead of checking whether the author is in the set.
  filter(artPoster %in% tsai_poster$artPoster) %>%
  # filter(cmtStatus != "→") %>% # PTT: keep only push/boo comments (disabled)
  dplyr::select(cmtPoster, artPoster, artUrl, cmtStatus)
# Monthly posting counts for "Wojnarowski".
# Parse the article date, then label each post with the first day of its month.
tsai_leader1 <- tsai_leader1 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, as a plain data.frame, tagged
# with the poster id.
tsai_leader1time <- mutate(
  as.data.frame(summarise(group_by(tsai_leader1, months), num = n())),
  poster = "Wojnarowski"
)
# Leader no. 2 (44 posts over these 5 months): all articles by "cheinshin".
tsai_leader2 <- filter(tsai, artPoster == "cheinshin")
## artTitle artDate artTime
## Length:44 Length:44 Length:44
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## artUrl artPoster artCat commentNum
## Length:44 Length:44 Length:44 Min. : 13.0
## Class :character Class :character Class :character 1st Qu.: 71.5
## Mode :character Mode :character Mode :character Median :274.0
## Mean :317.4
## 3rd Qu.:472.8
## Max. :927.0
## push boo sentence
## Min. : 3.00 Min. : 2.00 Length:44
## 1st Qu.: 32.75 1st Qu.: 16.25 Class :character
## Median :198.50 Median : 33.00 Mode :character
## Mean :202.61 Mean : 52.34
## 3rd Qu.:295.50 3rd Qu.: 68.75
## Max. :657.00 Max. :284.00
# Monthly posting counts for "cheinshin".
# Parse the article date, then label each post with the first day of its month.
tsai_leader2 <- tsai_leader2 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, tagged with the poster id.
tsai_leader2time <- mutate(
  summarise(group_by(tsai_leader2, months), num = n()),
  poster = "cheinshin"
)
# All articles posted by "TWOOOOOOOOOO".
tsai_leader3 <- filter(tsai, artPoster == "TWOOOOOOOOOO")
## artTitle artDate artTime
## Length:27 Length:27 Length:27
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## artUrl artPoster artCat commentNum
## Length:27 Length:27 Length:27 Min. : 18.0
## Class :character Class :character Class :character 1st Qu.: 38.0
## Mode :character Mode :character Mode :character Median : 81.0
## Mean : 194.8
## 3rd Qu.: 177.5
## Max. :1099.0
## push boo sentence
## Min. : 3.0 Min. : 1.00 Length:27
## 1st Qu.: 14.5 1st Qu.: 10.50 Class :character
## Median : 21.0 Median : 24.00 Mode :character
## Mean :102.1 Mean : 36.48
## 3rd Qu.: 68.5 3rd Qu.: 45.00
## Max. :745.0 Max. :188.00
# Monthly posting counts for "TWOOOOOOOOOO".
# Parse the article date, then label each post with the first day of its month.
tsai_leader3 <- tsai_leader3 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, tagged with the poster id.
tsai_leader3time <- mutate(
  summarise(group_by(tsai_leader3, months), num = n()),
  poster = "TWOOOOOOOOOO"
)
# Stack the three leaders' monthly counts and draw one bar panel per poster
# (each panel gets its own axes).
tsai_leader <- rbind(tsai_leader1time, tsai_leader2time, tsai_leader3time)
ggplot(tsai_leader, aes(x = months, y = num, fill = poster)) +
  geom_bar(stat = "identity") +
  facet_wrap(~poster, ncol = 2, scales = "free")
# Leader no. 1
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(tsai_leader1$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(tsai_leader1$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict.txt",
  stop_word = "dict/stop_word.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Leader no. 2
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(tsai_leader2$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(tsai_leader2$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict.txt",
  stop_word = "dict/stop_word.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Leader no. 3
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(tsai_leader3$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(tsai_leader3$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict.txt",
  stop_word = "dict/stop_word.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
Wojnarowski(96):[新聞]分享者 大部分是新聞報導,或者是轉貼蔡英文的臉書貼文居多,分享內容多元。包含蔡英文的外交政策(反對一國兩制、友邦交流)、施政成效上的宣傳、探訪國軍等視察活動,對韓國瑜政策的批評,甚至還有到嘉義吃虎兒油飯等較為輕鬆的臉書轉貼。
# Bigram tokenizer for unnest_tokens(): segments each text with the existing
# jieba_tokenizer worker and joins each pair of adjacent tokens with a space.
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    bigram <- ngrams(unlist(tokens), 2)
    # vapply() always yields a character vector, even when there are no bigrams.
    vapply(bigram, paste, character(1), collapse = " ")
  })
}
# Restrict to the three opinion leaders and split their posts into bigrams.
tsai_3 <- tsai %>%
  filter(artPoster %in% c("Wojnarowski", "cheinshin", "TWOOOOOOOOOO"))
devotion_bigram <- tsai_3 %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)
# Keep only the comment columns used below.
han1 <- han1 %>%
  dplyr::select(artUrl, commentPoster, commentStatus, commentContent)
length(unique(han$artPoster)) # number of posters: 2538
# Every account that appears as an article author or as a commenter.
# NOTE(review): allPoster had no visible definition before this point; it is
# built here the same way the Zhu section builds it - confirm.
allPoster <- c(han$artPoster, han1$commentPoster)
# Tag each user: "poster" if they ever authored an article, otherwise
# "replyer".
userList <- data.frame(user = unique(allPoster)) %>%
  mutate(type = ifelse(user %in% han$artPoster, "poster", "replyer"))
# Inner-join comments to their articles on artUrl.
han_all <- merge(x = han, y = han1, by = "artUrl")
## 9543 230
# Posting frequency per author, ascending.
# as.data.frame() on a table yields columns Var1/Freq, so name them explicitly
# to match the `freq` / `artPoster` references below.
han_poster <- as.data.frame(sort(table(han$artPoster)), stringsAsFactors = FALSE)
colnames(han_poster) <- c("artPoster", "freq")
# Keep authors with at least 10 posts.
han_poster <- han_poster %>% filter(freq >= 10)
han_link <- han_all %>%
  filter(commentNum >= 500) %>% # articles with at least 500 comments
  # Membership test: `==` would recycle han_poster$artPoster element-wise
  # against the rows instead of checking whether the author is in the set.
  filter(artPoster %in% han_poster$artPoster) %>%
  filter(commentStatus != "→") %>% # PTT: keep only push/boo comments
  dplyr::select(commentPoster, artPoster, artUrl, commentStatus)
# Monthly posting counts for "linhu8883324".
# Parse the article date, then label each post with the first day of its month.
han_leader1 <- han_leader1 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, as a plain data.frame, tagged
# with the poster id.
han_leader1time <- mutate(
  as.data.frame(summarise(group_by(han_leader1, months), num = n())),
  poster = "linhu8883324"
)
# Leader no. 2 (10 posts over these five months): all articles by "Aptantion".
han_leader2 <- filter(han, artPoster == "Aptantion")
## artTitle artDate artTime
## Length:10 Min. :2019-01-04 Length:10
## Class :character 1st Qu.:2019-02-16 Class1:hms
## Mode :character Median :2019-03-14 Class2:difftime
## Mean :2019-03-14 Mode :numeric
## 3rd Qu.:2019-04-01
## Max. :2019-05-29
## artUrl artPoster artCat commentNum
## Length:10 Length:10 Length:10 Min. : 361.0
## Class :character Class :character Class :character 1st Qu.: 553.5
## Mode :character Mode :character Mode :character Median : 664.0
## Mean : 695.8
## 3rd Qu.: 768.5
## Max. :1216.0
## push boo sentence
## Min. :302.0 Min. : 19.0 Length:10
## 1st Qu.:424.0 1st Qu.: 35.5 Class :character
## Median :473.5 Median : 54.0 Mode :character
## Mean :491.4 Mean : 73.3
## 3rd Qu.:573.5 3rd Qu.: 71.0
## Max. :752.0 Max. :261.0
# Monthly posting counts for "Aptantion".
# Parse the article date, then label each post with the first day of its month.
han_leader2 <- han_leader2 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, as a plain data.frame, tagged
# with the poster id.
han_leader2time <- mutate(
  as.data.frame(summarise(group_by(han_leader2, months), num = n())),
  poster = "Aptantion"
)
# All articles posted by "shared".
han_leader3 <- filter(han, artPoster == "shared")
## artTitle artDate artTime
## Length:36 Min. :2019-01-18 Length:36
## Class :character 1st Qu.:2019-02-15 Class1:hms
## Mode :character Median :2019-04-17 Class2:difftime
## Mean :2019-04-03 Mode :numeric
## 3rd Qu.:2019-05-07
## Max. :2019-05-30
## artUrl artPoster artCat commentNum
## Length:36 Length:36 Length:36 Min. : 18.0
## Class :character Class :character Class :character 1st Qu.: 56.0
## Mode :character Mode :character Mode :character Median : 114.0
## Mean : 282.4
## 3rd Qu.: 321.0
## Max. :1479.0
## push boo sentence
## Min. : 5.00 Min. : 1.00 Length:36
## 1st Qu.: 16.75 1st Qu.: 12.00 Class :character
## Median : 50.50 Median : 28.00 Mode :character
## Mean :135.03 Mean : 44.86
## 3rd Qu.:162.25 3rd Qu.: 56.50
## Max. :759.00 Max. :191.00
# Monthly posting counts for "shared".
# Parse the article date, then label each post with the first day of its month.
han_leader3 <- han_leader3 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, as a plain data.frame, tagged
# with the poster id.
han_leader3time <- mutate(
  as.data.frame(summarise(group_by(han_leader3, months), num = n())),
  poster = "shared"
)
# Stack the three leaders' monthly counts and draw one bar panel per poster
# (each panel gets its own axes).
han_leader <- rbind(han_leader1time, han_leader2time, han_leader3time)
ggplot(han_leader, aes(x = months, y = num, fill = poster)) +
  geom_bar(stat = "identity") +
  facet_wrap(~poster, ncol = 2, scales = "free")
# Leader no. 1
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(han_leader1$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(han_leader1$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_2.txt",
  stop_word = "dict/stop_word_2.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Leader no. 2
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(han_leader2$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(han_leader2$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_2.txt",
  stop_word = "dict/stop_word_2.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Leader no. 3
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(han_leader3$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(han_leader3$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_2.txt",
  stop_word = "dict/stop_word_2.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Bigram tokenizer for unnest_tokens(): segments each text with the existing
# jieba_tokenizer worker and joins each pair of adjacent tokens with a space.
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    bigram <- ngrams(unlist(tokens), 2)
    # vapply() always yields a character vector, even when there are no bigrams.
    vapply(bigram, paste, character(1), collapse = " ")
  })
}
# Restrict to the three opinion leaders and split their posts into bigrams.
han_3 <- han %>%
  filter(artPoster %in% c("linhu8883324", "Aptantion", "shared"))
devotion_bigram <- han_3 %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)
# Inner-join comments to their articles on artUrl.
zhu_all <- merge(x = zhu, y = zhu1, by = "artUrl")
# Every account that appears as an article author or as a commenter.
allPoster <- c(zhu$artPoster, zhu1$commentPoster)
# Tag each user: "poster" if they ever authored an article, otherwise
# "replyer".
userList <- mutate(
  data.frame(user = unique(allPoster)),
  type = ifelse(user %in% zhu$artPoster, "poster", "replyer")
)
length(unique(zhu$artPoster)) # number of posters: 340
## 527 111
# Posting frequency per author, ascending.
# as.data.frame() on a table yields columns Var1/Freq, so name them explicitly
# to match the `freq` / `artPoster` references below.
zhu_poster <- as.data.frame(sort(table(zhu$artPoster)), stringsAsFactors = FALSE)
colnames(zhu_poster) <- c("artPoster", "freq")
# Keep authors with at least 10 posts.
zhu_poster <- zhu_poster %>% filter(freq >= 10)
zhu_link <- zhu_all %>%
  filter(commentNum >= 100) %>% # articles with at least 100 comments
  # Membership test: `==` would recycle zhu_poster$artPoster element-wise
  # against the rows instead of checking whether the author is in the set.
  filter(artPoster.x %in% zhu_poster$artPoster) %>%
  # filter(commentStatus != "→") %>% # PTT: keep only push/boo comments (disabled)
  dplyr::select(commentPoster, artPoster.x, artUrl, commentStatus)
# Keep only users that actually appear in zhu_link (otherwise igraph would
# also draw users with no edges, which is meaningless), ordered by
# descending type.
zhu_filtered_user <- arrange(
  filter(
    userList,
    user %in% zhu_link$commentPoster | user %in% zhu_link$artPoster.x
  ),
  desc(type)
)
# # 建立網路關係
# set.seed(487)
# net <- graph_from_data_frame(d=zhu_link, v=zhu_filtered_user, directed=F)
# # DEGREE大於10 將印出LABEL否則則無
# labels <- degree(net)
# V(net)$label <- names(labels)
# V(net)$color <- ifelse(V(net)$type=="poster", "gold", "lightblue")
# # 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
# E(net)$color <- ifelse(E(net)$commentStatus == "推", "lightgreen", "palevioletred")
# plot(net, vertex.size=2, edge.arrow.size=.2,
# vertex.label=ifelse(degree(net) > 10, V(net)$label, NA), vertex.label.cex=.5)
# # 加入標示
# legend(x=-2, y=-0.2, c("發文者","回文者"), pch=21,
# col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=2,
# text.width=0.02,x.intersp=0.7,adj=1,y.intersp=0.1,bty="n")
# legend(x=-2., y=1, c("推","噓"),
# col=c("lightgreen","palevioletred"), lty=1, cex=2,
# text.width=0.02,x.intersp=0.7,adj=1,y.intersp=0.1,bty="n")
## artTitle artDate artTime
## Length:27 Min. :2019-04-01 Length:27
## Class :character 1st Qu.:2019-04-12 Class1:hms
## Mode :character Median :2019-04-20 Class2:difftime
## Mean :2019-04-24 Mode :numeric
## 3rd Qu.:2019-05-03
## Max. :2019-05-31
## artUrl artPoster artCat commentNum
## Length:27 Length:27 Length:27 Min. : 10.00
## Class :character Class :character Class :character 1st Qu.: 20.50
## Mode :character Mode :character Mode :character Median : 26.00
## Mean : 36.33
## 3rd Qu.: 38.00
## Max. :197.00
## push boo sentence
## Min. : 0.00 Min. : 2.00 Length:27
## 1st Qu.: 3.50 1st Qu.: 6.00 Class :character
## Median : 6.00 Median :11.00 Mode :character
## Mean : 10.22 Mean :15.19
## 3rd Qu.: 7.00 3rd Qu.:16.00
## Max. :116.00 Max. :54.00
# Monthly posting counts for "MayorKoWenJe".
# Parse the article date, then label each post with the first day of its month.
zhu_leader1 <- zhu_leader1 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, as a plain data.frame, tagged
# with the poster id.
zhu_leader1time <- mutate(
  as.data.frame(summarise(group_by(zhu_leader1, months), num = n())),
  poster = "MayorKoWenJe"
)
# Leader no. 2 (13 posts over these five months): all articles by "Whitening".
zhu_leader2 <- filter(zhu, artPoster == "Whitening")
## artTitle artDate artTime
## Length:13 Min. :2019-01-03 Length:13
## Class :character 1st Qu.:2019-01-06 Class1:hms
## Mode :character Median :2019-01-20 Class2:difftime
## Mean :2019-01-23 Mode :numeric
## 3rd Qu.:2019-02-05
## Max. :2019-02-26
## artUrl artPoster artCat commentNum
## Length:13 Length:13 Length:13 Min. : 9.0
## Class :character Class :character Class :character 1st Qu.: 38.0
## Mode :character Mode :character Mode :character Median : 81.0
## Mean :101.1
## 3rd Qu.:118.0
## Max. :284.0
## push boo sentence
## Min. : 4.00 Min. : 1.00 Length:13
## 1st Qu.: 10.00 1st Qu.: 2.00 Class :character
## Median : 16.00 Median : 28.00 Mode :character
## Mean : 28.38 Mean : 35.69
## 3rd Qu.: 29.00 3rd Qu.: 45.00
## Max. :144.00 Max. :168.00
# Monthly posting counts for "Whitening".
# Parse the article date, then label each post with the first day of its month.
zhu_leader2 <- zhu_leader2 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, as a plain data.frame, tagged
# with the poster id.
zhu_leader2time <- mutate(
  as.data.frame(summarise(group_by(zhu_leader2, months), num = n())),
  poster = "Whitening"
)
# All articles posted by "MoriiKaho".
zhu_leader3 <- filter(zhu, artPoster == "MoriiKaho")
## artTitle artDate artTime
## Length:37 Min. :2019-01-17 Length:37
## Class :character 1st Qu.:2019-02-02 Class1:hms
## Mode :character Median :2019-02-25 Class2:difftime
## Mean :2019-02-23 Mode :numeric
## 3rd Qu.:2019-03-13
## Max. :2019-04-26
## artUrl artPoster artCat commentNum
## Length:37 Length:37 Length:37 Min. : 4.00
## Class :character Class :character Class :character 1st Qu.: 17.00
## Mode :character Mode :character Mode :character Median : 29.00
## Mean : 59.54
## 3rd Qu.: 57.00
## Max. :314.00
## push boo sentence
## Min. : 0.00 Min. : 0.00 Length:37
## 1st Qu.: 5.00 1st Qu.: 3.00 Class :character
## Median : 9.00 Median : 6.00 Mode :character
## Mean : 29.62 Mean :10.84
## 3rd Qu.: 19.00 3rd Qu.:10.00
## Max. :193.00 Max. :60.00
# Monthly posting counts for "MoriiKaho".
# Parse the article date, then label each post with the first day of its month.
zhu_leader3 <- zhu_leader3 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, as a plain data.frame, tagged
# with the poster id.
zhu_leader3time <- mutate(
  as.data.frame(summarise(group_by(zhu_leader3, months), num = n())),
  poster = "MoriiKaho"
)
# Stack the three leaders' monthly counts and draw one bar panel per poster
# (each panel gets its own axes).
zhu_leader <- rbind(zhu_leader1time, zhu_leader2time, zhu_leader3time)
ggplot(zhu_leader, aes(x = months, y = num, fill = poster)) +
  geom_bar(stat = "identity") +
  facet_wrap(~poster, ncol = 2, scales = "free")
# Leader no. 1
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(zhu_leader1$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(zhu_leader1$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_2.txt",
  stop_word = "dict/stop_word_2.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Leader no. 2
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(zhu_leader2$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(zhu_leader2$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_2.txt",
  stop_word = "dict/stop_word_2.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Leader no. 3
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(zhu_leader3$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(zhu_leader3$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_2.txt",
  stop_word = "dict/stop_word_2.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Bigram tokenizer for unnest_tokens(): segments each text with the existing
# jieba_tokenizer worker and joins each pair of adjacent tokens with a space.
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    bigram <- ngrams(unlist(tokens), 2)
    # vapply() always yields a character vector, even when there are no bigrams.
    vapply(bigram, paste, character(1), collapse = " ")
  })
}
# Restrict to the three opinion leaders and split their posts into bigrams.
zhu_3 <- zhu %>%
  filter(artPoster %in% c("MayorKoWenJe", "Whitening", "MoriiKaho"))
devotion_bigram <- zhu_3 %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)
三名意見領袖:MayorKoWenJe、MoriiKaho 、Whitening
MoriiKaho主要是轉貼朱立倫的新聞,所以也是正面情緒偏多。Whitening的貼文都是有關兩岸政策的文章,原本預期會有較多負面詞,但實際上分數並沒有偏向負面。
# Keep only the comment columns that are needed (selected by position).
# NOTE(review): positions 4, 7, 8, 10 are assumed to hold the article URL,
# commenter id, push/boo status and comment text - confirm against the CSV.
kuo_r <- kuo_r[, c(4, 7, 8, 10)]
# The last name previously carried a leading space (" cmtContent").
colnames(kuo_r) <- c("artUrl", "cmtPoster", "cmtStatus", "cmtContent")
# Number of posters: 1165
# Inner-join comments to their articles on artUrl.
kuo_all <- merge(x = kuo, y = kuo_r, by = "artUrl")
# Edge list: cmtPoster (commenter), artPoster (article author), artUrl.
link <- kuo_all %>%
  dplyr::select(cmtPoster, artPoster, artUrl)
# Column order matters: graph_from_data_frame() reads column 1 as "from" and
# column 2 as "to"; remaining columns become edge attributes.
## 1699 345
# Posting frequency per author, ascending.
# as.data.frame() on a table yields columns Var1/Freq, so name them explicitly
# to match the `freq` / `artPoster` references below.
kuo_poster <- as.data.frame(sort(table(kuo$artPoster)), stringsAsFactors = FALSE)
colnames(kuo_poster) <- c("artPoster", "freq")
# Keep authors with at least 5 posts.
kuo_poster <- kuo_poster %>% filter(freq >= 5)
link <- kuo_all %>%
  filter(commentNum >= 100) %>% # articles with at least 100 comments
  # Membership test: `==` would recycle kuo_poster$artPoster element-wise
  # against the rows instead of checking whether the author is in the set.
  filter(artPoster %in% kuo_poster$artPoster) %>%
  filter(cmtStatus != "→") %>% # PTT: keep only push/boo comments
  dplyr::select(cmtPoster, artPoster, artUrl, cmtStatus)
# Monthly posting counts for "raugeon".
# Parse the article date, then label each post with the first day of its month.
kuo_leader1 <- kuo_leader1 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, as a plain data.frame, tagged
# with the poster id.
kuo_leader1time <- mutate(
  as.data.frame(summarise(group_by(kuo_leader1, months), num = n())),
  poster = "raugeon"
)
# Leader no. 2 (7 posts over these 2 months): all articles by "toshbio".
kuo_leader2 <- filter(kuo, artPoster == "toshbio")
## artTitle artDate artTime
## Length:7 Length:7 Length:7
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## artUrl artPoster artCat commentNum
## Length:7 Length:7 Length:7 Min. : 36.0
## Class :character Class :character Class :character 1st Qu.: 49.0
## Mode :character Mode :character Mode :character Median :110.0
## Mean :232.3
## 3rd Qu.:272.5
## Max. :837.0
## push boo sentence
## Min. : 14.0 Min. :11.00 Length:7
## 1st Qu.: 17.5 1st Qu.:12.50 Class :character
## Median : 49.0 Median :17.00 Mode :character
## Mean :149.6 Mean :19.86
## 3rd Qu.:169.5 3rd Qu.:21.00
## Max. :610.0 Max. :44.00
# Monthly posting counts for "toshbio".
# Parse the article date, then label each post with the first day of its month.
kuo_leader2 <- kuo_leader2 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, tagged with the poster id.
kuo_leader2time <- mutate(
  summarise(group_by(kuo_leader2, months), num = n()),
  poster = "toshbio"
)
# All articles posted by "aventardorsv".
kuo_leader3 <- filter(kuo, artPoster == "aventardorsv")
## artTitle artDate artTime
## Length:6 Length:6 Length:6
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## artUrl artPoster artCat commentNum
## Length:6 Length:6 Length:6 Min. : 11.00
## Class :character Class :character Class :character 1st Qu.: 23.75
## Mode :character Mode :character Mode :character Median :124.00
## Mean :217.00
## 3rd Qu.:264.00
## Max. :742.00
## push boo sentence
## Min. : 4.0 Min. : 4.00 Length:6
## 1st Qu.: 9.5 1st Qu.: 5.25 Class :character
## Median : 59.5 Median :23.00 Mode :character
## Mean :103.3 Mean :31.50
## 3rd Qu.:101.2 3rd Qu.:56.50
## Max. :389.0 Max. :72.00
# Monthly posting counts for "aventardorsv".
# Parse the article date, then label each post with the first day of its month.
kuo_leader3 <- kuo_leader3 %>%
  mutate(artDate = as.Date(artDate)) %>%
  mutate(months = as.Date(cut(artDate, "months")))
# One row per month with the number of posts, tagged with the poster id.
kuo_leader3time <- mutate(
  summarise(group_by(kuo_leader3, months), num = n()),
  poster = "aventardorsv"
)
# Stack the three leaders' monthly counts and draw one bar panel per poster
# (each panel gets its own axes).
kuo_leader <- rbind(kuo_leader1time, kuo_leader2time, kuo_leader3time)
ggplot(kuo_leader, aes(x = months, y = num, fill = poster)) +
  geom_bar(stat = "identity") +
  facet_wrap(~poster, ncol = 2, scales = "free")
# Leader no. 1
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(kuo_leader1$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(kuo_leader1$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_3.txt",
  stop_word = "dict/stop_word_3.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Leader no. 2
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(kuo_leader2$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(kuo_leader2$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_3.txt",
  stop_word = "dict/stop_word_3.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Leader no. 3
# Split each article into sentences on full-width or half-width exclamation
# marks, question marks, semicolons, and the ideographic full stop.
# The full-width forms are written as escapes: U+FF01, U+FF1B, U+FF1F.
devotion_sentences <- strsplit(kuo_leader3$sentence, "[。\uff01\uff1b\uff1f!?;]")
# Pair every sentence with the URL of the article it came from.
# lengths() stays an integer vector even when there are no articles.
devotion_sentences <- data.frame(
  id = rep(kuo_leader3$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences),
  stringsAsFactors = FALSE # keep sentences as character
) %>%
  # Drop sentences made only of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
# Segmenter with the custom dictionary and stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_3.txt",
  stop_word = "dict/stop_word_3.txt",
  write = "NOFILE"
)
# Tokenizer for unnest_tokens(): one character vector of tokens per input
# string, keeping only tokens longer than one character.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens[nchar(tokens) > 1]
  })
}
# Tokenize, drop tokens containing digits or Latin letters, then count how
# often each word occurs in each article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
toshbio 則是以分享臉書、媒體的方式來傳達有關郭董的事情,而且有在講民主、和平。
aventardorsv 則是偏向鄉民的內容(廢話不多說,有圖有真相之類的用語)
# Bigram tokenizer for unnest_tokens(): segments each text with the existing
# jieba_tokenizer worker and joins each pair of adjacent tokens with a space.
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    bigram <- ngrams(unlist(tokens), 2)
    # vapply() always yields a character vector, even when there are no bigrams.
    vapply(bigram, paste, character(1), collapse = " ")
  })
}
# Restrict to the three opinion leaders and split their posts into bigrams.
kuo_3 <- kuo %>%
  filter(artPoster %in% c("raugeon", "toshbio", "aventardorsv"))
devotion_bigram <- kuo_3 %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)
三名意見領袖:raugeon、toshbio、aventardorsv
# Keep only the comment columns that are needed (selected by position).
kp_r <- kp_r[, c(4, 7, 8, 10)]
# NOTE(review): the last column name carries a leading space (" cmtContent");
# it is reproduced unchanged here -- confirm whether that is intentional.
colnames(kp_r) <- c("artUrl", "cmtPoster", "cmtStatus", " cmtContent")

# Inner-join the comments to their articles on artUrl.
kp_all <- merge(x = kp, y = kp_r, by = "artUrl")

# Take cmtPoster (commenter), artPoster (author) and artUrl (article link).
# The column order matters: for a directed graph, graph_from_data_frame()
# reads the first column as "from" and the second as "to"; any further columns
# describe the edge.
link <- dplyr::select(kp_all, cmtPoster, artPoster, artUrl)

# Build the directed reply network.
net <- graph_from_data_frame(d = link, directed = TRUE)
## 6846 300
# Count how many articles each author posted and keep the authors with at
# least 10 posts.
kp_poster <- table(kp$artPoster) %>%
  sort() %>%
  as.data.frame() %>%
  # as.data.frame() on a table names its columns Var1/Freq; rename them to the
  # names the filters below rely on.
  setNames(c("artPoster", "freq")) %>%
  filter(freq >= 10)

# Edges of the reply network, restricted to heavily discussed articles by
# frequent authors.
link <- kp_all %>%
  # Articles with at least 500 comments.
  filter(commentNum >= 500) %>%
  # Authors with at least 10 posts. `%in%` tests set membership; `==` would
  # compare element-wise against a vector of a different length.
  filter(artPoster %in% kp_poster$artPoster) %>%
  # Keep only PTT push/boo comments (drop the neutral arrow).
  filter(cmtStatus != "→") %>%
  dplyr::select(cmtPoster, artPoster, artUrl, cmtStatus)
# Parse the posting date and derive the first day of its month.
kp_leader1 <- kp_leader1 %>%
  mutate(
    artDate = as.Date(artDate),
    months = as.Date(cut(artDate, "months"))
  )

# Number of posts per month, labelled with the author's id.
kp_leader1time <- kp_leader1 %>%
  group_by(months) %>%
  summarise(num = n()) %>%
  as.data.frame() %>%
  mutate(poster = "jk182325")
# Opinion leader no. 2 (54 posts over these five months)
kp_leader2 <- filter(kp, artPoster == "thnlkj0665")
## artTitle artDate artTime
## Length:54 Length:54 Length:54
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## artUrl artPoster artCat commentNum
## Length:54 Length:54 Length:54 Min. : 31.0
## Class :character Class :character Class :character 1st Qu.: 97.5
## Mode :character Mode :character Mode :character Median : 210.5
## Mean : 263.5
## 3rd Qu.: 349.2
## Max. :1270.0
## push boo sentence
## Min. : 15.00 Min. : 2.00 Length:54
## 1st Qu.: 46.75 1st Qu.: 21.25 Class :character
## Median :118.00 Median : 32.50 Mode :character
## Mean :163.26 Mean : 40.91
## 3rd Qu.:226.75 3rd Qu.: 46.75
## Max. :640.00 Max. :220.00
# Parse the posting date and derive the first day of its month.
kp_leader2 <- kp_leader2 %>%
  mutate(
    artDate = as.Date(artDate),
    months = as.Date(cut(artDate, "months"))
  )

# Number of posts per month, labelled with the author's id.
kp_leader2time <- kp_leader2 %>%
  group_by(months) %>%
  summarise(num = n()) %>%
  mutate(poster = "thnlkj0665")
# Opinion leader no. 3: every article posted by TSMConduty.
kp_leader3 <- filter(kp, artPoster == "TSMConduty")
## artTitle artDate artTime
## Length:78 Length:78 Length:78
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## artUrl artPoster artCat commentNum
## Length:78 Length:78 Length:78 Min. : 4.0
## Class :character Class :character Class :character 1st Qu.: 32.5
## Mode :character Mode :character Mode :character Median : 67.5
## Mean : 164.5
## 3rd Qu.: 183.8
## Max. :1454.0
## push boo sentence
## Min. : 1.00 Min. : 0.00 Length:78
## 1st Qu.: 12.25 1st Qu.: 8.00 Class :character
## Median : 29.50 Median : 15.00 Mode :character
## Mean : 90.91 Mean : 29.46
## 3rd Qu.: 93.75 3rd Qu.: 35.00
## Max. :711.00 Max. :154.00
# Parse the posting date and derive the first day of its month.
kp_leader3 <- kp_leader3 %>%
  mutate(
    artDate = as.Date(artDate),
    months = as.Date(cut(artDate, "months"))
  )

# Number of posts per month, labelled with the author's id.
kp_leader3time <- kp_leader3 %>%
  group_by(months) %>%
  summarise(num = n()) %>%
  mutate(poster = "TSMConduty")
# Stack the three leaders' monthly counts and draw one bar panel per leader.
kp_leader <- rbind(kp_leader1time, kp_leader2time, kp_leader3time)
ggplot(kp_leader, aes(x = months, y = num, fill = poster)) +
  geom_col() +
  facet_wrap(~poster, ncol = 2, scales = "free")
# Opinion leader no. 1
# Split each article into sentences on the full-width period and on full-width
# or half-width exclamation marks, question marks and semicolons.
devotion_sentences <- strsplit(kp_leader1$sentence, "[。!;?!?;]")

# Pair every sentence with the URL of the article it came from and collect the
# pairs in one data frame.
devotion_sentences <- data.frame(
  # lengths() is the type-stable equivalent of sapply(x, length).
  id = rep(kp_leader1$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences)
) %>%
  # Drop sentences that consist solely of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the jieba segmentation engine with the user dictionary and the
# stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_3.txt",
  stop_word = "dict/stop_word_3.txt",
  write = "NOFILE"
)

# Tokenizer passed to unnest_tokens(): segments each string in `t` and returns
# a list holding one character vector of tokens per input string.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    # Keep only tokens longer than one character.
    tokens[nchar(tokens) > 1]
  })
}

# Tokenize every sentence and count how often each word occurs per article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  # Discard tokens that contain ASCII digits or Latin letters.
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Opinion leader no. 2
# Split each article into sentences on the full-width period and on full-width
# or half-width exclamation marks, question marks and semicolons.
devotion_sentences <- strsplit(kp_leader2$sentence, "[。!;?!?;]")

# Pair every sentence with the URL of the article it came from and collect the
# pairs in one data frame.
devotion_sentences <- data.frame(
  # lengths() is the type-stable equivalent of sapply(x, length).
  id = rep(kp_leader2$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences)
) %>%
  # Drop sentences that consist solely of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the jieba segmentation engine with the user dictionary and the
# stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_3.txt",
  stop_word = "dict/stop_word_3.txt",
  write = "NOFILE"
)

# Tokenizer passed to unnest_tokens(): segments each string in `t` and returns
# a list holding one character vector of tokens per input string.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    # Keep only tokens longer than one character.
    tokens[nchar(tokens) > 1]
  })
}

# Tokenize every sentence and count how often each word occurs per article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  # Discard tokens that contain ASCII digits or Latin letters.
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Opinion leader no. 3
# Split each article into sentences on the full-width period and on full-width
# or half-width exclamation marks, question marks and semicolons.
devotion_sentences <- strsplit(kp_leader3$sentence, "[。!;?!?;]")

# Pair every sentence with the URL of the article it came from and collect the
# pairs in one data frame.
devotion_sentences <- data.frame(
  # lengths() is the type-stable equivalent of sapply(x, length).
  id = rep(kp_leader3$artUrl, lengths(devotion_sentences)),
  sentence = unlist(devotion_sentences)
) %>%
  # Drop sentences that consist solely of tabs, newlines or spaces.
  filter(!str_detect(sentence, regex("^(\t|\n| )*$")))
devotion_sentences$sentence <- as.character(devotion_sentences$sentence)

# Set up the jieba segmentation engine with the user dictionary and the
# stop-word list.
jieba_tokenizer <- worker(
  user = "dict/use_dict_3.txt",
  stop_word = "dict/stop_word_3.txt",
  write = "NOFILE"
)

# Tokenizer passed to unnest_tokens(): segments each string in `t` and returns
# a list holding one character vector of tokens per input string.
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    # Keep only tokens longer than one character.
    tokens[nchar(tokens) > 1]
  })
}

# Tokenize every sentence and count how often each word occurs per article.
devotion_words <- devotion_sentences %>%
  unnest_tokens(word, sentence, token = chi_tokenizer) %>%
  # Discard tokens that contain ASCII digits or Latin letters.
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(id, word, sort = TRUE)
# Bigram tokenizer passed to unnest_tokens(): segments each string in `t` with
# the jieba engine and returns, per input string, a character vector of
# adjacent token pairs joined by a single space.
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    tokens <- unlist(segment(x, jieba_tokenizer))
    # A bigram needs at least two tokens; return an empty vector otherwise.
    if (length(tokens) < 2) {
      return(character(0))
    }
    bigram <- ngrams(tokens, 2)
    vapply(bigram, paste, character(1), collapse = " ")
  })
}

# Keep only the articles written by the three opinion leaders, then split
# their text into bigrams.
kp_3 <- kp %>%
  filter(artPoster %in% c("jk182325", "thnlkj0665", "TSMConduty"))
devotion_bigram <- kp_3 %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)
三名意見領袖:jk182325、thnlkj0665、TSMConduty。jk182325 雖然是用分享新聞的方式,但其分享的新聞情緒字眼都蠻多的,所以起伏非常大。
thnlkj0665 的情緒又更加明顯,但以正面居多,推估是因為其大多分享柯文哲的臉書文章,也因此用詞都會較為正面(候選人粉專大多不會有過度負面的情緒字眼)。
領導人 ,0.421120285987854
下任總統 ,0.3970821797847748
民選總統 ,0.39381736516952515
520 ,0.3750308156013489
國家元首 ,0.36245524883270264
蔡賴配 ,0.35755008459091187
總統大選 ,0.3544211685657501
脫黨 ,0.35220929980278015
模型判定:
有「領導人」、「下任總統」等近似詞, 其中出現「蔡賴配」與「脫黨」等字詞,
顯示網友常提及蔡賴檔,或集中討論賴、 蔡或韓國瑜脫黨參選的可能。
「郭董」、「董事長」、「總裁」、「企業家」、「參選者」,和其身為鴻海董事長兼總統參選人的身分有關。「離家出走」、「股價」、「關公」則和選舉動態相關。
韓國瑜的相似詞和其宣傳的個人特質較為相關,如「八字」以民俗傳統渲染,同時也出現「高雄發大財」、「Ido」等政治宣言,而「夫人」、「陳其邁」也是積極討論的對象。
「蘇貞昌」、「行政院長」、「前閣」,因曾任行政院院長;「賴神」,與個人風格相關;「君子之爭」、「勸退」、「辜寬敏」、「吳澧培」、「張善政」,與黨內初選活動相關。
「馬英九」、「陳水扁」因當過總統,故常與之一起比較,而「強勢」、「辣台妹」、「堅定」則顯現其政治態度。最後「瓦卡」、「辜寬敏」、「張葉森」則是近期評論總統大選的重要人士。
其中經過比較後發現,相似字詞大約可以分為兩類:第一類和候選人本身特質相關,可以看出其在網民心中大略的政治形象。如郭台銘就有「郭董」、「總裁」、「企業家」等,可看出選民對其鴻海董事長的企業家形象印象較為深刻。
賴清德 vs 總統 Cosine 相似度: 0.29456112
郭台銘 vs 總統 Cosine 相似度: 0.27846256
朱立倫 vs 總統 Cosine 相似度: 0.25331753
蔡英文 vs 總統 Cosine 相似度: 0.24053028
柯文哲 vs 總統 Cosine 相似度: 0.17790192
韓國瑜 vs 總統 Cosine 相似度: 0.16188833
🗿 意見領袖形象塑造
■ 賴清德:分享民調
■ 蔡英文:臉書傳聲筒
■ 柯文哲:爭議性話題
■ 韓國瑜:新聞與黑特的己見
■ 朱立倫:政策與兩岸議題
■ 郭台銘:抨擊參選人
💡 社群風向分析
1. 他們帶起的風向是正面還是負面形象?
2. 成功與否?
3. 彼此之間有什麼差異?
4. 我們探討出來的相似詞彙?
5. 這次網路聲量上能夠占一席之地的參選人會是誰?
早在1944年,Lazarsfeld et al.(1944) 與研究團隊於當年美國總統大選,嘗試找出媒體輿論如何影響個人的投票決定,意外發現比起大眾傳播的方式,人際之間的傳播較有可能改變閱聽者的態度;現今的新網路媒體 INSIDE 硬塞的網路趨勢觀察也指出,政治人物在新聞或節目上所說的話,首先影響的是傳統選民,但若要發揮更大的作用、散播至網路上,仍得靠「仲介者」。可以看出:不論是以往或是當今,意見領袖對於選戰而言都是舉足輕重的角色。
Park, C. S. (2013) 表示推特的意見領導會導致他人政治參與度增加,因此我們推測台灣的論壇PTT也有可能會具備相似的效果,不僅能引起討論熱度,也能帶動政治參與。
Lazarsfeld, P. F., Berelson, B., & Gaudet, H. (1944). The people’s choice. How the voter makes up his mind in a presidential campaign. New York, NY: Columbia University Press.
Park, C. S. (2013). Does Twitter motivate involvement in politics? Tweeting, opinion leadership, and political engagement. Computers in Human Behavior, 29(4), 1641-1648.
吳學展(2015)。【解構 PTT】有自己的法律、法院和貨幣,沒被臉書淘汰的 PTT 是台灣民主化的重要推手。橘報。取自https://buzzorange.com/