Chinese Text Mining
Pecu PPT / Code :
https://ceiba.ntu.edu.tw/course/26c034/content/TextMining.pdf
https://ceiba.ntu.edu.tw/modules/index.php?csn=26c034&default_fun=syllabus&current_lang=chinese
Install the following packages first
tmcn : https://r-forge.r-project.org/R/?group_id=1571
install.packages("NLP")
install.packages("tm")
install.packages("jiebaRD")
install.packages("jiebaR")
install.packages("RColorBrewer")
install.packages("wordcloud")
install.packages("rvest")
How to install tmcn
Because the package has been removed from CRAN, install.packages("tmcn") no longer works directly; install it from the archive instead.
Mac / Linux
install.packages("http://download.r-forge.r-project.org/src/contrib/tmcn_0.1-4.tar.gz", repos = NULL, type = "source")
Windows
Step 1. Click Packages > Install
Step 2. Choose Install from Package Archive File (.zip; .tar.gz)
Step 3. Select tmcn_0.1-4.zip from the TextMining.zip downloaded from ceiba
Step 4. Click Install and you are done!
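If you prefer the Console on Windows, the same installation can be done in one line; a minimal sketch (the path below is only an example, point it at wherever you unzipped TextMining.zip):
install.packages("C:/Downloads/TextMining/tmcn_0.1-4.zip", repos = NULL, type = "win.binary") # example path, adjust to your own copy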
Example 1 --- Collecting text with a web crawler
rm(list=ls(all.names = TRUE))
library(tmcn)
library(rvest)
URL = "https://www.ptt.cc/bbs/NTUcourse/index.html"
html = read_html(URL)                 # download the board index page
title = html_nodes(html, "a")         # every <a> tag on the page
href = html_attr(title, "href")       # the link target of each <a> tag
data = data.frame(title = toUTF8(html_text(title)), href = href)   # article titles and their links
Example 2 --- Looking deeper into the content of one URL
rm(list=ls(all.names = TRUE))
library(tmcn)
library(rvest)
URL = "https://www.ptt.cc/bbs/NTUcourse/index.html"
html = read_html(URL)
title = html_nodes(html,"a")
href = html_attr(title,"href")
data = data.frame(title = toUTF8(html_text(title)),href = href)
subURL = paste0("https://www.ptt.cc", data$href[12])   # full URL of the 12th link
subhtml = read_html(subURL)
content = html_nodes(subhtml, "div#main-content.bbs-screen.bbs-content")   # the article body
result = toUTF8(html_text(content))
Example 3 --- Wrapping the four extra lines from Example 2 into a function
rm(list=ls(all.names = TRUE))
library(tmcn)
library(rvest)
URL = "https://www.ptt.cc/bbs/NTUcourse/index.html"
html = read_html(URL)
title = html_nodes(html,"a")
href = html_attr(title,"href")
data = data.frame(title = toUTF8(html_text(title)),href = href)
getContent <- function(href){
subURL = paste0("https://www.ptt.cc", href)
subhtml = read_html(subURL)
content = html_nodes(subhtml,"div#main-content.bbs-screen.bbs-content")
return(toUTF8(html_text(content)))
}
Screenshot after the function has been created:
Once it works, test it by typing getContent(data$href[14]) in the Console.
Screenshot of the test result:
Example 4 --- Tidying up and saving the collected data as a .txt file
rm(list=ls(all.names = TRUE))
library(tmcn)
library(rvest)
URL = "https://www.ptt.cc/bbs/NTUcourse/index.html"
html = read_html(URL)
title = html_nodes(html,"a")
href = html_attr(title,"href")
data = data.frame(title = toUTF8(html_text(title)),href = href)
#remove the information we do not need (the first 10 links)
data = data[-c(1:10),]
getContent <- function(href){
subURL = paste0("https://www.ptt.cc", href)
subhtml = read_html(subURL)
content = html_nodes(subhtml,"div#main-content.bbs-screen.bbs-content")
return(toUTF8(html_text(content)))
}
allText = sapply(data$href,getContent)
allText
write.table(allText, "mydata.txt") # save as a .txt file
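If the Chinese text in mydata.txt comes out garbled, writing through a connection with an explicit encoding may help; a minimal sketch (UTF-8 is an assumption here, and Example 5 below shows the same idea with BIG-5 commented out):
out <- file("mydata.txt", "w", encoding = "UTF-8") # open the file with an explicit encoding (assumed UTF-8)
write.table(allText, out)
close(out)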
Example 5 --- Calling another R script
pttTestFunction.R
library(tmcn)
library(rvest)
pttTestFunction <- function(URL, filename)
{
#URL = "https://www.ptt.cc/bbs/NTUcourse/index.html"
html = read_html(URL)
title = html_nodes(html, "a")
href = html_attr(title, "href")
data = data.frame(title = toUTF8(html_text(title)),
href = href)
data = data[-c(1:10),]
getContent <- function(x) {
url = paste0("https://www.ptt.cc", x)
tag = html_node(read_html(url), 'div#main-content.bbs-screen.bbs-content')
text = toUTF8(html_text(tag))
}
#getContent(data$href[1])
allText = sapply(data$href, getContent)
allText
#out <- file(filename, "w", encoding="BIG-5")
write.table(allText, filename)
#close(out)
}
main.R
source('pttTestFunction.R')
id = c(1:10)
URL = paste0("https://www.ptt.cc/bbs/NTUcourse/index", id, ".html")
filename = paste0(id, ".txt")
pttTestFunction(URL[1], filename[1])
mapply(pttTestFunction,
URL = URL, filename = filename)
Then run main.R. If it does not run, change source('pttTestFunction.R') to an absolute path, or put both scripts in the same project.
In-class exercise
Combine a function, mapply, and automatically generated file names.
Text cleaning
Reference:
https://cran.r-project.org/doc/contrib/de_Jonge+van_der_Loo-Introduction_to_data_cleaning_with_R.pdf
Typical cleaning steps (a short sketch follows this list):
Case conversion
Removing punctuation and digits
Removing URLs
Removing emoticons and stop words
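A minimal sketch of these cleaning steps using tm and gsub (the toy sentence and the English stop-word list are assumptions for illustration; Example 6 applies the same idea to the PTT corpus):
library(tm)
docs <- Corpus(VectorSource("Check HTTP://example.com now :) 123!!!")) # toy example text (assumed)
docs <- tm_map(docs, content_transformer(tolower)) # case conversion
removeURL <- content_transformer(function(x) gsub("http[^[:space:]]*", "", x))
docs <- tm_map(docs, removeURL) # remove URLs
docs <- tm_map(docs, removePunctuation) # remove punctuation (this also strips emoticons such as ":)")
docs <- tm_map(docs, removeNumbers) # remove digits
docs <- tm_map(docs, removeWords, stopwords("en")) # remove (English) stop words
docs <- tm_map(docs, stripWhitespace) # squeeze extra whitespace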
Example 6 --- Reading files from disk (relies on the NLP and tm libraries)
rm(list=ls(all.names = TRUE))
library(NLP)
library(tm)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)
filenames <- list.files(getwd(), pattern = "\\.txt$")  # every .txt file in the working directory
files <- lapply(filenames, readLines)
docs <- Corpus(VectorSource(files)) # a Corpus is tm's container for a collection of text documents
# remove symbols that may cause problems
# content_transformer wraps a function that edits the document text
toSpace <- content_transformer(function(x, pattern) {
  return(gsub(pattern, "", x))
})
# paste each symbol you want deleted from the text; add more lines to delete more symbols
docs <- tm_map(docs, toSpace, "paste the symbol you want removed here")
docs <- tm_map(docs, toSpace, "[a-zA-Z]")
# remove punctuation
# remove digits and white space
docs <- tm_map(docs, removePunctuation)
docs <- tm_map(docs, removeNumbers)
docs <- tm_map(docs, stripWhitespace)
# Stemming (illustrated with English)
# https://zh.wikipedia.org/wiki/词干提取
#library(SnowballC)
# ensures every inflected form of a word is reduced to the same stem
#docs <- tm_map(docs, stemDocument)
mixseg = worker()                                # create a jiebaR segmenter
jieba_tokenizer = function(d){
  unlist(segment(d[[1]], mixseg))                # segment one document into words
}
seg = lapply(docs, jieba_tokenizer)              # segment every document in the corpus
freqFrame = as.data.frame(table(unlist(seg)))    # word-frequency table
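Before drawing the word cloud it can help to eyeball the frequency table; a quick sanity check on the freqFrame built above:
freqFrame <- freqFrame[order(freqFrame$Freq, decreasing = TRUE), ] # sort by descending frequency
head(freqFrame, 20) # the 20 most frequent terms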
Fixing garbled Chinese in the word cloud on Mac
Bonus: the same fix applies when doing more advanced word-association plots
par(family = "Heiti TC Light") # use a font with Chinese glyphs so the labels are not garbled on macOS
wordcloud(freqFrame$Var1,freqFrame$Freq,
scale=c(5,0.1),min.freq=50,max.words=150,
random.order=TRUE, random.color=FALSE,
rot.per=.1, colors=brewer.pal(8, "Dark2"),
ordered.colors=FALSE,use.r.layout=FALSE,
fixed.asp=TRUE)
Word cloud parameters:
min.freq=50: only words that appear at least 50 times are drawn
max.words=150: at most 150 words
random.order=TRUE: words are placed in random order
colors=brewer.pal(8, "Dark2"): one of the palettes provided by library(RColorBrewer)
Another way to replace text:
files[[1]] = gsub("的", "", files[[1]])
files[[1]] = gsub("作者", "", files[[1]])
Word cloud result:
Word segmentation
Producing a correct Chinese term-frequency matrix
- Use mixseg = worker() to create a segmenter
Usage example:
> library(jiebaRD)
> library(jiebaR)
> mixseg = worker()
> segment("新聞資料處理與視覺呈現是一門好課", mixseg)
[1] "新聞資料""處理""與""視覺""呈現""是""一門""好課"
- Use new_user_word to add new words to the dictionary (see the sketch after this list)
Reference sites:
用結巴斷詞實作文字視覺化 — 以 2016 總統就職演講為例
http://blog.infographics.tw/2016/05/text-visualization-with-jieba/
Chinese word segmentation system (CKIP)
http://ckipsvr.iis.sinica.edu.tw
- Use cutter = worker("tag") to segment text and also get each word's part-of-speech tag (a sketch follows)
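A minimal sketch of both points, reusing the sample sentence from above (the added word "視覺呈現" is only an illustration for new_user_word; replace it with your own vocabulary):
library(jiebaR)
mixseg = worker()
new_user_word(mixseg, "視覺呈現") # add a custom word so the segmenter keeps it whole (example word)
segment("新聞資料處理與視覺呈現是一門好課", mixseg)
cutter = worker("tag") # a "tag" worker also returns part-of-speech tags
words = segment("新聞資料處理與視覺呈現是一門好課", cutter)
names(words) # the POS tags are stored as the names of the result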