繁体   English   中英

SVM用于R中的文本分类

[英]SVM for text classification in R

我正在使用SVM对我的文本进行分类,但实际上我并没有得到结果,而是得到了数字概率。

数据框(1:20训练集,21:50测试集)

更新:

     ou <- structure(list(text = structure(c(1L, 6L, 1L, 1L, 8L, 13L, 24L, 
5L, 11L, 12L, 33L, 36L, 20L, 25L, 4L, 19L, 9L, 29L, 22L, 3L, 
8L, 8L, 8L, 2L, 8L, 27L, 30L, 3L, 14L, 35L, 3L, 34L, 23L, 31L, 
22L, 6L, 6L, 7L, 17L, 3L, 8L, 32L, 18L, 15L, 21L, 26L, 3L, 16L, 
10L, 28L), .Label = c("access, access, access, access", "character(0)", 
"report", "report, access", "report, access, access", "report, access, access, access", 
"report, access, access, access, access, access, access", "report, access, access, access, access, access, access, access", 
"report, access, access, access, access, access, access, report", 
"report, access, access, access, access, access, report", "report, access, access, access, report", 
"report, access, access, access, report, access", "report, access, access, report, access, access, access, access, access, access", 
"report, data", "report, data, data", "report, data, data, data", 
"report, data, data, data, data", "report, data, data, data, data, data", 
"report, data, data, data, report, report, data, access,access", 
"report, data, data, report", "report, data, report", "report, report", 
"report, report, access, access, access", "report, report, access, access, report, report, report, report, report, report, data, data, report, access, report, report", 
"report, report, access, report, report, report, report, report, data, data, report, access, report, report", 
"report, report, access, report, report, report, report, report, report, data, data, report, access, report, report", 
"report, report, data", "report, report, data, report", "report, report, report, data, report, report, data, data, report, data, data", 
"report, report, report, report", "report, report, report, report, data, report, report, data, report, data, report", 
"report, report, report, report, report, data, report, data, data", 
"report, report, report, report, report, report, report", "report, report, report, report, report, report, report, access, access, access", 
"report, report, report, report, report, report, report, report, data, data, report, access, report, report", 
"report, report, report, report, report, report, report, report, report, report, data, report, report, report, report, report, report, report,report"
), class = "factor"), value = structure(c(2L, 2L, 2L, 2L, 2L, 
2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 3L, 3L, 3L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("", 
"Access", "Report/Data"), class = "factor")), .Names = c("text", 
"value"), class = "data.frame", row.names = c(NA, -50L))

使用的代码:

        library(RTextTools)

        doc_matrix <- create_matrix(ou$text, language="english", removeNumbers=TRUE, stemWords=TRUE, removeSparseTerms=.998)

        #container <- create_container(doc_matrix, ou$text, trainSize=1:20, testSize=21:50, virgin=FALSE)
        container <- create_container(doc_matrix, as.numeric(factor(ou$text)), trainSize=1:20, testSize=21:50, virgin=FALSE)

        #Training models
        SVM <- train_model(container,"SVM")
        MAXENT <- train_model(container,"MAXENT")
        BAGGING <- train_model(container,"BAGGING")
        TREE <- train_model(container,"TREE")

        #Classify data using trained models
        SVM_CLASSIFY <- classify_model(container, SVM)
        MAXENT_CLASSIFY <- classify_model(container, MAXENT)
        BAGGING_CLASSIFY <- classify_model(container, BAGGING)

        #Analytics

        analytics <- create_analytics(container,SVM_CLASSIFY)

        models <- train_models(container, algorithms=c("MAXENT","SVM"))
        results <- classify_models(container, models)
        analytics <- create_analytics(container, results)
        summary(analytics)
        SVM <- cross_validate(container, 5, "SVM")
        write.csv(analytics@document_summary, "DocumentSummary.csv")

预期结果:

          text                                                          value
     21 report, access, access, access, access, access, access, access       Access
     22 report, access, access, access, access, access, access, access       Access
     23 report, access, access, access, access, access, access, access       Access
     24 character(0)                                                          NA
     25 report, access, access, access, access, access, access, access       Access
     26 report, report, data                                             Report/Data
     27 report, report, report, report                                   Report/Data
     28 report                                                          Report/Data
     29 report, data                                                    Report/Data
     30 report, report, report, report, report, report, report, report,
         data, data, report, access, report, report                      Report/Data

概率为的结果:

>   MAXENTROPY_LABEL    MAXENTROPY_PROB SVM_LABEL   SVM_PROB    MANUAL_CODE CONSENSUS_CODE  CONSENSUS_AGREE CONSENSUS_INCORRECT PROBABILITY_CODE    PROBABILITY_INCORRECT
> 1 8   0.999999066 22  0.070090645 8   8   1   0   8   0
> 2 8   0.999999066 22  0.070090645 8   8   1   0   8   0
> 3 8   0.999999066 22  0.070090645 8   8   1   0   8   0
> 4 1   0.055555556 12  0.071384112 2   12  1   1   12  1
> 5 8   0.999999066 22  0.070090645 8   8   1   0   8   0
> 6 25  1   12  0.074126949 27  25  1   1   25  1
> 7 33  0.627904676 13  0.068572857 30  33  1   1   33  1
> 8 33  0.406792176 12  0.074592181 3   33  1   1   33  1
> 9 20  1   12  0.074507793 14  20  1   1   20  1

编辑1:如何获得标签名称而不是SVM标签编号。

我通常要做的是

ou <- cbind(ou$text, results)

并打印标签:

ou$value <- "NONE"
ou$value[results$SVM_LABEL=="1"]  <- "Access"
ou$value[results$SVM_LABEL=="-1"] <- "Report/Data"
ou 

(假设您在训练模型时使用了1和-1)

我知道这有点原始,但是很清楚而且可以正常使用

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM