Data Science and Machine Learning: k-means, PCA, SVM

Pecu PPT / Code:

https://goo.gl/KASM22

https://goo.gl/cAYb67

Reference: https://deeplearning4j.org/word2vec

SVR: https://en.wikipedia.org/wiki/Support_vector_machine#Regression

If your problem can be constructed from physical properties, we can use simulation.

Use PCA for dimensionality reduction: find the most representative projection vectors or projection planes.
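As a minimal illustration of "most representative projection" (a sketch using only base R and the built-in iris data, not part of the original notes):

```r
# Proportion of variance captured by each principal component; the first
# components are the most representative projection directions.
pca <- prcomp(iris[, 1:4], center = TRUE, scale. = TRUE)
summary(pca)          # proportion of variance per component
pca$rotation[, 1:2]   # loadings of the first two projection vectors
```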

SVM classification -- the SVM package in R is e1071.
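Example 4 below walks through e1071 in detail; for a first look at the interface, here is a minimal sketch on the built-in iris data (not part of the original notes):

```r
library(e1071)
# Linear SVM on iris; predict() then returns class labels
fit <- svm(Species ~ ., data = iris, kernel = "linear")
table(predicted = predict(fit), actual = iris$Species)
```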

Part 1. Install the following packages

Step 1. install.packages("devtools")

Step 2. library(devtools)

install_github("vqv/ggbiplot")

## R will download this automatically from https://github.com/vqv/ggbiplot
## The two lines above must be executed in this order

Step 3. install.packages("scales")

install.packages("e1071")

Step 4. library(scales)

library(grid) # grid is a base package; it does not need to be installed separately

library(ggbiplot)

# Example 1: PCA.r

```r
library(devtools)
install_github("vqv/ggbiplot")
library(scales)
library(grid)
library(ggbiplot)

# PCA on the four numeric iris measurements (prcomp centers by default)
data(iris)
ir.pca <- prcomp(iris[, 1:4])
ir.species <- iris[, 5]

# Biplot: observations colored by species, with confidence ellipses
g <- ggbiplot(ir.pca, obs.scale = 1, var.scale = 1,
              groups = ir.species, ellipse = TRUE,
              circle = TRUE)
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
               legend.position = 'top')
print(g)
```

Result plot: (PCA biplot of the iris data; figure omitted)

# Example 2: MLDM.r

If you do not have a .txt file available, run main.R first (it calls pttTestFunction.R).

```r
library(NLP)
library(tm)
library(tmcn)
### Everyone's JAVA_HOME is different; see the notes from 2017-05-03
Sys.setenv(JAVA_HOME="/Library/Java/JavaVirtualMachines/jdk1.8.0_101.jdk/Contents/Home")
library(rJava)
library(SnowballC)
library(slam)
library(Matrix)

# import data
source("readFromTXT.R")

# corpus to tdm: build a term-document matrix, keeping terms of length >= 2
d.corpus <- Corpus(VectorSource(seg))
tdm <- TermDocumentMatrix(d.corpus,
                          control = list(wordLengths = c(2, Inf)))
View(inspect(tdm[1:9, 1:10]))

# terms associated with a query term at correlation >= 0.75
# NOTE: the query term was lost in these notes; supply a term of interest
ass = findAssocs(tdm, "", 0.75)
```

```r
# tf-idf computation
N = tdm$ncol                  # number of documents
tf <- apply(tdm, 2, sum)      # total term count per document
idfCal <- function(word_doc)
{
  log2( N / nnzero(word_doc) )
}
idf <- apply(tdm, 1, idfCal)  # idf per term
doc.tfidf <- as.matrix(tdm)
for(x in 1:nrow(tdm))
{
  for(y in 1:ncol(tdm))
  {
    doc.tfidf[x,y] <- (doc.tfidf[x,y] / tf[y]) * idf[x]
  }
}
```
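The double loop above is clear but slow in R. A vectorized sketch of the same computation (using the tf and idf defined above, not part of the original notes) gives identical values:

```r
# Vectorized equivalent of the loop: divide each column by its document
# total, then multiply each row by its idf weight.
doc.tfidf.fast <- sweep(as.matrix(tdm), 2, tf, FUN = "/")
doc.tfidf.fast <- sweep(doc.tfidf.fast, 1, idf, FUN = "*")
# all.equal(doc.tfidf, doc.tfidf.fast) should be TRUE
```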

```r
result = findFreqTerms(tdm, 200)   # terms appearing at least 200 times

library(plotly)
# row indices in the tdm of the associated terms found above
topID = lapply(rownames(as.data.frame(ass)), function(x)
  which(rownames(tdm) == x))
topID = unlist(topID)
# tf-idf of two selected terms across documents
# (the original passed mode = "box", which is not a valid scatter mode in
#  plotly; "lines+markers" is used here instead)
plot_ly(data = as.data.frame(doc.tfidf),
        x = as.numeric(colnames(doc.tfidf)),
        y = doc.tfidf[topID[10],],
        name = rownames(doc.tfidf)[topID[10]],
        type = "scatter", mode = "lines+markers") %>%
  add_trace(y = doc.tfidf[topID[2],],
            name = rownames(doc.tfidf)[topID[2]])
```

```r
# get short doc matrix: keep only terms with at least one nonzero tf-idf
nonzero = (doc.tfidf != 0)
nonzeroid = which(row_sums(nonzero) != 0)
q <- rownames(doc.tfidf[nonzeroid,])
all.term <- rownames(doc.tfidf)
loc <- which(all.term %in% q)
s.tdm <- doc.tfidf[loc,]
View(s.tdm)
```
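The ranking below is based on cosine similarity: for two document tf-idf vectors $x$ and $y$,

$$\cos(x, y) = \frac{x \cdot y}{\lVert x \rVert_2 \, \lVert y \rVert_2},$$

so documents pointing in the same direction score 1 and orthogonal documents score 0. Note that R's `norm()` defaults to the one-norm for matrices, which is why the function below computes the Euclidean norms explicitly.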

```r
# result : cos similarity ranking
cos.sim <- function(x, y)
{
  # The original used norm(), whose default is the one-norm; cosine
  # similarity needs the Euclidean (L2) norm.
  sum(x * y) / (sqrt(sum(x^2)) * sqrt(sum(y^2)))
}
# similarity of document 1 vs document 2
doc.cos <- cos.sim(x = as.matrix(s.tdm[,1]),
                   y = as.matrix(s.tdm[,2]))
# similarity of documents 2..10 against document 2
doc.cos <- apply(s.tdm[,2:10], 2, cos.sim,
                 y = as.matrix(s.tdm[,2]))
orderDoc <- doc.cos[order(doc.cos, decreasing = TRUE)]
# (mode = "box" dropped from the original: bar traces take no mode argument)
plot_ly(data = as.data.frame(orderDoc),
        x = rownames(as.data.frame(orderDoc)),
        y = orderDoc,
        name = rownames(doc.tfidf)[topID[10]],
        type = "bar")

library(stats)
# cluster the terms (rows of the tf-idf matrix) into 5 groups
kmeansOut <- kmeans(doc.tfidf, 5, nstart = 20)
```
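k = 5 here is a fixed choice; a common sanity check is an elbow plot of the total within-cluster sum of squares over a range of k. A minimal sketch (assuming the same doc.tfidf matrix, not part of the original notes):

```r
# Elbow plot: total within-cluster sum of squares for k = 1..10
wss <- sapply(1:10, function(k)
  kmeans(doc.tfidf, k, nstart = 20)$tot.withinss)
plot(1:10, wss, type = "b", xlab = "k", ylab = "total within-cluster SS")
```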

# Example 3: DLTM.r

```r
source('MLDM.R')
testTfidf = doc.tfidf
# PCA on the tf-idf matrix, colored by the k-means clusters from MLDM.R
tfidf.pca <- prcomp(testTfidf)
tfidf.kmeans <- as.factor(kmeansOut$cluster)
g <- ggbiplot(tfidf.pca, obs.scale = 1, var.scale = 1,
              groups = tfidf.kmeans, ellipse = TRUE,
              circle = TRUE, labels = rownames(testTfidf))
g <- g + scale_color_discrete(name = '')
g <- g + theme(legend.direction = 'horizontal',
               legend.position = 'top')
print(g)
```

Result plot: (PCA biplot colored by k-means cluster; figure omitted)

# Example 4: SVM.r

```r
set.seed(10111)
# 20 points in 2D; class +1 is shifted up-right by 1 to make it separable-ish
x = matrix(rnorm(40), 20, 2)
y = rep(c(-1, 1), c(10, 10))
x[y == 1, ] = x[y == 1, ] + 1
plot(x, col = y + 3, pch = 19)

library(e1071)
dat = data.frame(x, y = as.factor(y))
# linear SVM with a fairly hard margin (cost = 10), no rescaling
svmfit = svm(y ~ ., data = dat,
             kernel = "linear",
             cost = 10, scale = FALSE)
print(svmfit)
plot(svmfit, dat)

# evaluate the classifier on a 75 x 75 grid covering the data range
make.grid = function(x, n = 75) {
  grange = apply(x, 2, range)
  x1 = seq(from = grange[1, 1], to = grange[2, 1], length = n)
  x2 = seq(from = grange[1, 2], to = grange[2, 2], length = n)
  expand.grid(X1 = x1, X2 = x2)
}
xgrid = make.grid(x)
ygrid = predict(svmfit, xgrid)
plot(xgrid, col = c("red", "blue")[as.numeric(ygrid)], pch = 20, cex = 0.2)
points(x, col = y + 3, pch = 19)
points(x[svmfit$index, ], pch = 5, cex = 2)  # mark the support vectors

# recover the linear decision boundary from the support vectors
beta = drop(t(svmfit$coefs) %*% x[svmfit$index, ])
beta0 = svmfit$rho
plot(xgrid, col = c("red", "blue")[as.numeric(ygrid)], pch = 20, cex = 0.2)
points(x, col = y + 3, pch = 19)
points(x[svmfit$index, ], pch = 5, cex = 2)
abline(beta0/beta[2], -beta[1]/beta[2])                 # decision boundary
abline((beta0 - 1)/beta[2], -beta[1]/beta[2], lty = 2)  # margin
abline((beta0 + 1)/beta[2], -beta[1]/beta[2], lty = 2)  # margin
```
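Why these abline calls work: for a linear kernel, e1071's decision function is $f(x) = \beta \cdot x - \beta_0$, where $\beta$ is the weighted sum of support vectors computed above and $\beta_0$ is `svmfit$rho`. Setting $f(x) = 0$ gives the solid boundary $x_2 = \beta_0/\beta_2 - (\beta_1/\beta_2)\,x_1$, and the dashed lines are the margins $f(x) = \pm 1$.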

```r
# mixture data from The Elements of Statistical Learning
load(url("https://statistics.stanford.edu/~tibs/ElemStatLearn/datasets/ESL.mixture.rda"))
names(ESL.mixture)
rm(x, y)
attach(ESL.mixture)
plot(x, col = y + 1)

# nonlinear SVM with a radial (RBF) kernel
dat = data.frame(y = factor(y), x)
fit = svm(factor(y) ~ ., data = dat, scale = FALSE, kernel = "radial", cost = 5)

# predicted class over the grid supplied with the data (px1, px2)
xgrid = expand.grid(X1 = px1, X2 = px2)
ygrid = predict(fit, xgrid)
plot(xgrid, col = as.numeric(ygrid), pch = 20, cex = 0.2)
points(x, col = y + 1, pch = 19)

# decision values let us draw the actual boundary (level 0) and compare it
# to the Bayes-optimal boundary (prob = 0.5, supplied with the data)
func = predict(fit, xgrid, decision.values = TRUE)
func = attributes(func)$decision
plot(xgrid, col = as.numeric(ygrid), pch = 20, cex = 0.2)
points(x, col = y + 1, pch = 19)
contour(px1, px2, matrix(func, 69, 99), level = 0, add = TRUE)
contour(px1, px2, matrix(prob, 69, 99), level = 0.5, add = TRUE, col = "blue",
        lwd = 2)
```
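cost = 5 above is set by hand; e1071 can also pick cost and gamma by cross-validation. A minimal sketch with tune.svm (the grid values are illustrative, not from the original notes):

```r
# 10-fold cross-validation over a small grid of gamma and cost
set.seed(1)
tuned <- tune.svm(factor(y) ~ ., data = dat, kernel = "radial",
                  gamma = c(0.5, 1, 2), cost = c(0.1, 1, 5, 10))
summary(tuned)
tuned$best.model   # the SVM refit at the best (gamma, cost)
```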

Result plot: (radial-kernel decision boundary vs. Bayes boundary; figure omitted)
