Chinese Text Mining
Pecu PPT / Code :
https://ceiba.ntu.edu.tw/course/26c034/content/TextMining.pdf
https://ceiba.ntu.edu.tw/modules/index.php?csn=26c034&default_fun=syllabus&current_lang=chinese
Install the following packages first
tmcn : https://r-forge.r-project.org/R/?group_id=1571
install.packages("NLP")
install.packages("tm")
install.packages("jiebaRD")
install.packages("jiebaR")
install.packages("RColorBrewer")
install.packages("wordcloud")
install.packages("rvest")
How to install tmcn
Because the package has been removed from CRAN, install.packages("tmcn") no longer works directly; install it from the archive instead.
Mac / Linux
install.packages("http://download.r-forge.r-project.org/src/contrib/tmcn_0.1-4.tar.gz", repos = NULL, type = "source")
Windows
Step 1. Click Packages > Install
Step 2. Choose Install from Package Archive File (.zip; .tar.gz)
Step 3. Select tmcn_0.1-4.zip from the TextMining.zip downloaded from ceiba
Step 4. Click Install and you are done!
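If you prefer the Console on Windows, the same installation can be done in one line; a minimal sketch (the path below is only an example, point it at wherever you unzipped TextMining.zip):
install.packages("C:/Downloads/TextMining/tmcn_0.1-4.zip", repos = NULL, type = "win.binary") # example path, adjust to your own copy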
Example 1 --- Collecting text with a web crawler
rm(list=ls(all.names = TRUE))
library(tmcn)
library(rvest)
URL = "https://www.ptt.cc/bbs/NTUcourse/index.html"
html = read_html(URL)                 # download the board index page
title = html_nodes(html, "a")         # every <a> tag on the page
href = html_attr(title, "href")       # the link target of each <a> tag
data = data.frame(title = toUTF8(html_text(title)), href = href)   # article titles and their links
Example 2 --- Looking deeper into the content of one URL
rm(list=ls(all.names = TRUE))
library(tmcn)
library(rvest)
URL = "https://www.ptt.cc/bbs/NTUcourse/index.html"
html = read_html(URL)
title = html_nodes(html,"a")
href = html_attr(title,"href")
data = data.frame(title = toUTF8(html_text(title)),href = href)
subURL = paste0("https://www.ptt.cc", data$href[12])   # full URL of the 12th link
subhtml = read_html(subURL)
content = html_nodes(subhtml, "div#main-content.bbs-screen.bbs-content")   # the article body
result = toUTF8(html_text(content))
Example 3 --- Wrapping the four extra lines from Example 2 into a function
rm(list=ls(all.names = TRUE))
library(tmcn)
library(rvest)
URL = "https://www.ptt.cc/bbs/NTUcourse/index.html"
html = read_html(URL)
title = html_nodes(html,"a")
href = html_attr(title,"href")
data = data.frame(title = toUTF8(html_text(title)),href = href)
getContent <- function(href){
subURL = paste0("https://www.ptt.cc", href)
subhtml = read_html(subURL)
content = html_nodes(subhtml,"div#main-content.bbs-screen.bbs-content")
return(toUTF8(html_text(content)))
}
Screenshot after the function has been created:
Once it works, test it by typing getContent(data$href[14]) in the Console.
Screenshot of the test result:
Example 4 --- Tidying up and saving the collected data as a .txt file
rm(list=ls(all.names = TRUE))
library(tmcn)
library(rvest)
URL = "https://www.ptt.cc/bbs/NTUcourse/index.html"
html = read_html(URL)
title = html_nodes(html,"a")
href = html_attr(title,"href")
data = data.frame(title = toUTF8(html_text(title)),href = href)
#remove the information we do not need (the first 10 links)
data = data[-c(1:10),]
getContent <- function(href){
subURL = paste0("https://www.ptt.cc", href)
subhtml = read_html(subURL)
content = html_nodes(subhtml,"div#main-content.bbs-screen.bbs-content")
return(toUTF8(html_text(content)))
}
allText = sapply(data$href,getContent)
allText
write.table(allText, "mydata.txt") # save as a .txt file
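If the Chinese text in mydata.txt comes out garbled, writing through a connection with an explicit encoding may help; a minimal sketch (UTF-8 is an assumption here, and Example 5 below shows the same idea with BIG-5 commented out):
out <- file("mydata.txt", "w", encoding = "UTF-8") # open the file with an explicit encoding (assumed UTF-8)
write.table(allText, out)
close(out)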
Example 5 --- Calling another R script
pttTestFunction.R
library(tmcn)
library(rvest)
pttTestFunction <- function(URL, filename)
{
#URL = "https://www.ptt.cc/bbs/NTUcourse/index.html"
html = read_html(URL)
title = html_nodes(html, "a")
href = html_attr(title, "href")
data = data.frame(title = toUTF8(html_text(title)),
href = href)
data = data[-c(1:10),]
getContent <- function(x) {
url = paste0("https://www.ptt.cc", x)
tag = html_node(read_html(url), 'div#main-content.bbs-screen.bbs-content')
text = toUTF8(html_text(tag))
}
#getContent(data$href[1])
allText = sapply(data$href, getContent)
allText
#out <- file(filename, "w", encoding="BIG-5")
write.table(allText, filename)
#close(out)
}
main.R
source('pttTestFunction.R')
id = c(1:10)
URL = paste0("https://www.ptt.cc/bbs/NTUcourse/index", id, ".html")
filename = paste0(id, ".txt")
pttTestFunction(URL[1], filename[1])
mapply(pttTestFunction,
URL = URL, filename = filename)
Then run main.R. If it does not run, change source('pttTestFunction.R') to an absolute path, or put both scripts in the same project.
In-class exercise
Combine a function, mapply, and automatically generated file names.
Text cleaning
Reference:
https://cran.r-project.org/doc/contrib/de_Jonge+van_der_Loo-Introduction_to_data_cleaning_with_R.pdf
Typical cleaning steps (a short sketch follows this list):
Case conversion
Removing punctuation and digits
Removing URLs
Removing emoticons and stop words
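A minimal sketch of these cleaning steps using tm and gsub (the toy sentence and the English stop-word list are assumptions for illustration; Example 6 applies the same idea to the PTT corpus):
library(tm)
docs <- Corpus(VectorSource("Check HTTP://example.com now :) 123!!!")) # toy example text (assumed)
docs <- tm_map(docs, content_transformer(tolower)) # case conversion
removeURL <- content_transformer(function(x) gsub("http[^[:space:]]*", "", x))
docs <- tm_map(docs, removeURL) # remove URLs
docs <- tm_map(docs, removePunctuation) # remove punctuation (this also strips emoticons such as ":)")
docs <- tm_map(docs, removeNumbers) # remove digits
docs <- tm_map(docs, removeWords, stopwords("en")) # remove (English) stop words
docs <- tm_map(docs, stripWhitespace) # squeeze extra whitespace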
Example 6 --- Reading files from disk (relies on the NLP and tm libraries)
rm(list=ls(all.names = TRUE))
library(NLP)
library(tm)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)
filenames <- list.files(getwd(), pattern = "\\.txt$")  # every .txt file in the working directory
files <- lapply(filenames, readLines)
docs <- Corpus(VectorSource(files)) # a Corpus is tm's container for a collection of text documents
# remove symbols that may cause problems
# content_transformer wraps a function that edits the document text
toSpace <- content_transformer(function(x, pattern) {
  return(gsub(pattern, "", x))
})
# paste each symbol you want deleted from the text; add more lines to delete more symbols
docs <- tm_map(docs, toSpace, "paste the symbol you want removed here")
docs <- tm_map(docs, toSpace, "[a-zA-Z]")
# remove punctuation
# remove digits and white space
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
# Stemming (illustrated with English)
# https://zh.wikipedia.org/wiki/词干提取
#library(SnowballC)
# ensures every inflected form of a word is reduced to the same stem
#docs <- tm_map(docs, stemDocument)
mixseg = worker()                                # create a jiebaR segmenter
jieba_tokenizer = function(d){
  unlist(segment(d[[1]], mixseg))                # segment one document into words
}
seg = lapply(docs, jieba_tokenizer)              # segment every document in the corpus
freqFrame = as.data.frame(table(unlist(seg)))    # word-frequency table
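Before drawing the word cloud it can help to eyeball the frequency table; a quick sanity check on the freqFrame built above:
freqFrame <- freqFrame[order(freqFrame$Freq, decreasing = TRUE), ] # sort by descending frequency
head(freqFrame, 20) # the 20 most frequent terms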
Fixing garbled Chinese in the word cloud on Mac
Bonus: the same fix applies when doing more advanced word-association plots
par(family = "Heiti TC Light") # use a font with Chinese glyphs so the labels are not garbled on macOS
wordcloud(freqFrame$Var1,freqFrame$Freq,
scale=c(5,0.1),min.freq=50,max.words=150,
random.order=TRUE, random.color=FALSE,
rot.per=.1, colors=brewer.pal(8, "Dark2"),
ordered.colors=FALSE,use.r.layout=FALSE,
fixed.asp=TRUE)
Word cloud parameters:
min.freq=50: only words that appear at least 50 times are drawn
max.words=150: at most 150 words
random.order=TRUE: words are placed in random order
colors=brewer.pal(8, "Dark2"): one of the palettes provided by library(RColorBrewer)
Another way to replace text:
files[[1]] = gsub("的", "", files[[1]])
files[[1]] = gsub("作者", "", files[[1]])
Word cloud result:
Word segmentation
Producing a correct Chinese term-frequency matrix
- Use mixseg = worker() to create a segmenter
Usage example:
> library(jiebaRD)
> library(jiebaR)
> mixseg = worker()
> segment("新聞資料處理與視覺呈現是一門好課", mixseg)
[1] "新聞資料""處理""與""視覺""呈現""是""一門""好課"
- Use new_user_word to add new words to the dictionary (see the sketch after this list)
Reference sites:
用結巴斷詞實作文字視覺化 — 以 2016 總統就職演講為例
http://blog.infographics.tw/2016/05/text-visualization-with-jieba/
Chinese word segmentation system (CKIP)
http://ckipsvr.iis.sinica.edu.tw
- Use cutter = worker("tag") to segment text and also get each word's part-of-speech tag (a sketch follows)
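A minimal sketch of both points, reusing the sample sentence from above (the added word "視覺呈現" is only an illustration for new_user_word; replace it with your own vocabulary):
library(jiebaR)
mixseg = worker()
new_user_word(mixseg, "視覺呈現") # add a custom word so the segmenter keeps it whole (example word)
segment("新聞資料處理與視覺呈現是一門好課", mixseg)
cutter = worker("tag") # a "tag" worker also returns part-of-speech tags
words = segment("新聞資料處理與視覺呈現是一門好課", cutter)
names(words) # the POS tags are stored as the names of the result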