  • [R] wordcloud2 Package Practice
    Programming/R 2021. 6. 7. 22:24

    This post was written while taking the 'AI Developer Training Course' of the 2021 Innovative Growth Youth Talent Intensive Training Program, drawing on the lectures and lecture notes.


    The WordCloud2 Package

    • Advantages
      • Does not trigger the display bug that wordcloud can hit in RStudio
      • Applies colors automatically
      • Lays words out in a variety of shapes
      • Far simpler and easier to use than wordcloud (see the minimal sketch below)

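    To see just how little setup that last point implies, here is a minimal sketch using demoFreq, the sample word-frequency data frame bundled with wordcloud2 (this example is an addition, not part of the original lecture notes):

    library(wordcloud2)
    head(demoFreq)          # two columns, word and freq - the only input wordcloud2 needs
    wordcloud2(demoFreq)    # colors and layout are chosen automatically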

    WordCloud2 Practice Example

    # 02 Build the DTM #
    library(RCurl)
    library(XML)
    
    # Read the Wikipedia page and keep only the text inside <p> tags
    t = readLines('https://en.wikipedia.org/wiki/Data_science')
    d = htmlParse(t, asText = TRUE)
    clean_doc = xpathSApply(d, "//p", xmlValue)
    
    library(tm)
    library(SnowballC)
    
    doc = Corpus(VectorSource(clean_doc))
    inspect(doc)
    
    # Standard tm preprocessing: lower-case, then strip numbers, stopwords, punctuation, and extra whitespace
    doc = tm_map(doc, content_transformer(tolower))
    doc = tm_map(doc, removeNumbers)
    doc = tm_map(doc, removeWords, stopwords('english'))
    doc = tm_map(doc, removePunctuation)
    doc = tm_map(doc, stripWhitespace)
    
    dtm = DocumentTermMatrix(doc)
    dim(dtm)
    
    inspect(dtm)
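    inspect() on a large DTM prints only a sample of its entries; slicing the matrix keeps the output readable. A small aside, not in the original code:

    inspect(dtm[1:3, 1:8])    # first 3 documents, first 8 terms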
    
    # 03 Word cloud #
    library(wordcloud)
    
    m = as.matrix(dtm)
    v = sort(colSums(m), decreasing = TRUE)
    d = data.frame(word = names(v), freq = v)
    # max.words caps how many words are drawn; rot.per is the fraction rotated 90 degrees
    wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words = 100, random.order = FALSE, rot.per = 0.35)
    
    library(RColorBrewer)
    
    pal = brewer.pal(11,"Spectral")
    wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words = 50, random.order = FALSE, rot.per = 0.50, colors = pal)
    wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words = 50, random.order = FALSE, rot.per = 0.50, colors = pal, family = "mono", font = 2)
    
    # Build a two-column data frame, pass it in, and the word cloud is done!
    library(wordcloud2)
    wordcloud2(d)
    
    d1 = d[1:200, ]        # show only the top 200 words
    wordcloud2(d1, shape = 'star')
    wordcloud2(d1, minRotation = pi/4, maxRotation = pi/4, rotateRatio = 1.0)
    
    findFreqTerms(dtm, lowfreq = 12)
    
    findAssocs(dtm, terms = 'harvard', corlimit = 0.7)
    
    barplot(d[1:10, ]$freq, las = 2, names.arg = d[1:10, ]$word, col = 'lightblue', main = 'Most frequent words', ylab = 'Word frequency')
    
    # install.packages("gapminder")    # one-time install, if needed
    library(gapminder)
    library(dplyr)
    
    # Total 2007 population per continent, drawn as a word cloud;
    # naming the columns word/freq lets both wordcloud and wordcloud2 consume d directly
    pop_siz = gapminder %>% filter(year == 2007) %>% group_by(continent) %>% summarize(pop = sum(as.numeric(pop)))
    d = data.frame(word = pop_siz$continent, freq = pop_siz$pop)
    wordcloud(words = d$word, freq = d$freq, min.freq = 1, max.words = 100, random.order = FALSE, rot.per = 0.35)
    wordcloud2(d)
    
    # 04 Document classification #
    
    library(text2vec)
    library(caret)
    
    # movie_review ships with text2vec: IMDB movie reviews labeled by sentiment
    str(movie_review)
    
    head(movie_review)
    
    # Split the data into a training set (mtrain) and a test set (mtest)
    train_list = createDataPartition(y = movie_review$sentiment, p = 0.6, list = FALSE)
    mtrain = movie_review[train_list, ]
    mtest = movie_review[-train_list, ]
    
    # Build the DTM from the training set
    doc = Corpus(VectorSource(mtrain$review))
    doc = tm_map(doc, content_transformer(tolower))
    doc = tm_map(doc, removeNumbers)
    doc = tm_map(doc, removeWords, stopwords('english'))
    doc = tm_map(doc, removePunctuation)
    doc = tm_map(doc, stripWhitespace)
    dtm = DocumentTermMatrix(doc)
    dim(dtm)
    str(dtm)
    inspect(dtm)
    
    # Keep only terms present in at least 10% of documents, to shrink the feature set
    dtm_small = removeSparseTerms(dtm, 0.90)
    X = as.matrix(dtm_small)
    # First column is the sentiment label; name it y and make it a factor
    dataTrain = as.data.frame(cbind(mtrain$sentiment, X))
    dataTrain$V1 = as.factor(dataTrain$V1)
    colnames(dataTrain)[1] = 'y'
    
    library(rpart)
    r = rpart(y~., data = dataTrain)
    printcp(r)
    par(mfrow = c(1, 1), xpd = NA)
    plot(r)
    text(r, use.n = TRUE)
    
    library(randomForest)
    f = randomForest(y~., data = dataTrain)
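    A quick way to sanity-check the forest before touching the test set is its out-of-bag error and variable importance. This check is an aside, not part of the original notes:

    print(f)         # OOB error estimate and per-class confusion matrix
    varImpPlot(f)    # which terms matter most (mean decrease in Gini)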
    
    # Build the DTM from the test set
    docTest = Corpus(VectorSource(mtest$review))
    docTest = tm_map(docTest, content_transformer(tolower))
    docTest = tm_map(docTest, removeNumbers)
    docTest = tm_map(docTest, removeWords, stopwords('english'))
    docTest = tm_map(docTest, removePunctuation)
    docTest = tm_map(docTest, stripWhitespace)
    
    # Restrict the test DTM to the training vocabulary so both share the same columns
    dtmTest = DocumentTermMatrix(docTest, control = list(dictionary = dtm_small$dimnames$Terms))
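    As a quick check (an addition, not in the original), the dictionary guarantees the test terms come from the training vocabulary:

    all(Terms(dtmTest) %in% Terms(dtm_small))    # should print TRUE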
    
    dim(dtmTest)
    str(dtmTest)
    inspect(dtmTest)
    
    X = as.matrix(dtmTest)
    dataTest = as.data.frame(cbind(mtest$sentiment, X))
    dataTest$V1 = as.factor(dataTest$V1)
    colnames(dataTest)[1] = 'y'
    pr = predict(r, newdata = dataTest, type = 'class')
    table(pr, dataTest$y)
    
    pf = predict(f, newdata = dataTest)
    table(pf, dataTest$y)
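    To compare the two classifiers with a single number, each confusion matrix can be reduced to accuracy. The helper below is an illustration added here, not part of the original code:

    accuracy = function(tab) sum(diag(tab)) / sum(tab)    # correct predictions / all predictions
    accuracy(table(pr, dataTest$y))    # decision tree
    accuracy(table(pf, dataTest$y))    # random forest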
    
    # 05 Processing Korean text with English text-mining tools #
    
    library(tm)
    library(XML)
    library(wordcloud2)
    library(SnowballC)
    library(RCurl)
    t = readLines('https://ko.wikipedia.org/wiki/%EB%B9%85_%EB%8D%B0%EC%9D%B4%ED%84%B0')
    d = htmlParse(t, asText = TRUE)
    clean_doc = xpathSApply(d, "//p", xmlValue)
    
    doc = Corpus(VectorSource(clean_doc))
    inspect(doc)
    
    doc = tm_map(doc, content_transformer(tolower))
    doc = tm_map(doc, removeNumbers)
    doc = tm_map(doc, removePunctuation)
    doc = tm_map(doc, stripWhitespace)
    
    dtm = DocumentTermMatrix(doc)
    dim(dtm)
    inspect(dtm)
    
    m = as.matrix(dtm)
    v = sort(colSums(m), decreasing = TRUE)
    d = data.frame(word = names(v), freq = v)
    d1 = d[1:500, ]                # show only the top 500 words
    wordcloud2(d1)
    
    # 06 Korean text mining with KoNLP #
    
    install.packages('KoNLP')
    library(KoNLP)
    # Three dictionaries are available; each call switches the active one
    useSystemDic()
    useSejongDic()
    useNIADic()
    
    # Settle on the Sejong dictionary for the examples below
    useSejongDic()
    s = '너에게 묻는다 연탄재 함부로 발로 차지 마라 너는 누구에게 한번이라도 뜨거운 사람이었느냐'    # sample input: a short Korean poem
    extractNoun(s)
    
    SimplePos22(s)    # part-of-speech tagging with the 22-tag set
    
    t=readLines('https://ko.wikipedia.org/wiki/%EB%B9%85_%EB%8D%B0%EC%9D%B4%ED%84%B0')
    d=htmlParse(t, asText=TRUE)
    clean_doc=xpathSApply(d, "//p", xmlValue)
    
    useSejongDic()
    
    nouns = extractNoun(clean_doc)
    mnouns = unlist(nouns)
    mnouns_freq = table(mnouns)
    v = sort(mnouns_freq, decreasing = TRUE)
    
    wordcloud2(v)              # show every word : [Figure 11-14(a)]
    v1 = v[1:100]              # show only the top 100 words : [Figure 11-14(b)]
    wordcloud2(v1)
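    extractNoun tends to return many one-character fragments; a common refinement (an assumption on my part, not in the original) is to drop them before plotting:

    v2 = v[nchar(names(v)) >= 2]    # keep only nouns of two or more characters
    wordcloud2(v2[1:100])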
    
    # [Going further] Building a corpus with the Twitter API #
    library(twitteR)
    # Replace the dummy key values below with the four keys you actually generated!
    consumerKey = "twEhTjcTtzJSUYqnewJp8fQgm"
    consumerSecret = "nJIcd0ldPlZiTpUey8oLFGPxyjkDReZl5aSkPVvPtXfwUhowXS"
    accessToken = "67037524gWk67HzUfJnSoYJbZAuyqak62SxzfnYp4X2pWzNG6"
    accessTokenSecret = "nWmXO2UT6pm6I0ZrizBwE94bOrdUJIsrt3pBCBSKbI4B0"
    
    setup_twitter_oauth(consumerKey, consumerSecret, accessToken, accessTokenSecret)
    keyword <- enc2utf8("미세먼지")    # search keyword: "fine dust"
    twitdata <- searchTwitter(keyword, n=100, lang="ko")
    twitdata_df <- twListToDF(twitdata)
    twitdata_text = twitdata_df$text
    twitdata_text
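    The collected tweet texts can then feed the same noun-extraction pipeline used in section 06. A sketch, assuming KoNLP is still loaded; this step is an addition, not part of the original notes:

    nouns = extractNoun(twitdata_text)
    v = sort(table(unlist(nouns)), decreasing = TRUE)
    wordcloud2(data.frame(word = names(v), freq = as.numeric(v)))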
