
Bag of Words Meets Bags of Popcorn

The Kaggle competition: https://www.kaggle.com/c/word2vec-nlp-tutorial

A deeper dive into sentiment analysis, using a naive Bayes classifier written from scratch rather than scikit-learn.

Approach
  1. Download the data and read it in
  2. Strip the HTML tags from the review text
  3. Collect every word into a vocabulary
  4. Remove the high-frequency words
  5. Turn each document's words into a bag-of-words count vector (a toy sketch of steps 3-5 follows this list)
  6. Train the model to obtain the conditional probabilities
  7. Run the same words-to-vector conversion on the test data
  8. Classify with classifyNB()
  9. Save the results with to_csv()
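Here is a minimal sketch of steps 3-5 on a made-up two-document corpus (hypothetical data, purely for illustration); the logic mirrors creatVocabList, calcMostFreq, and bagOfWords2VecMN in the full code below:

# Toy illustration of steps 3-5 on a hypothetical mini-corpus.
docs = [["great", "movie", "great", "fun"],
        ["boring", "movie"]]

# Step 3: the vocabulary is the union of all words (cf. creatVocabList)
vocab = sorted(set(word for doc in docs for word in doc))
# ['boring', 'fun', 'great', 'movie']

# Step 4: drop high-frequency words; pretend "movie" made the top-2000 list
vocab.remove("movie")

# Step 5: bag-of-words count vector for the first document (cf. bagOfWords2VecMN)
vec = [docs[0].count(word) for word in vocab]
print(vec)  # [0, 1, 2], i.e. the counts for 'boring', 'fun', 'great'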

Code

# -*- coding: utf-8 -*-
# !/usr/bin/env python
# https://www.kaggle.com/c/word2vec-nlp-tutorial
import numpy as np
import pandas as pd
import re
import math
import operator
from bs4 import BeautifulSoup  # for stripping HTML tags

def creatVocabList(dataSet):
    # Build the vocabulary as the union of the words in every document.
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)
    # Return a list so the vocabulary can be indexed and pruned later.
    return list(vocabSet)

def bagOfWords2VecMN(vocabList, inputSet):
    # Turn a parsed document (a list of words) into a bag-of-words count
    # vector over the vocabulary; repeated words increase the count.
    returnVec = [0] * len(vocabList)
    for i in range(len(vocabList)):
        if vocabList[i] in inputSet:
            returnVec[i] = inputSet.count(vocabList[i])
    return returnVec

# Training: estimate the class-conditional word probabilities from the count vectors.
# trainMatrix: the document matrix (one count vector per document)
# trainCategory: the label of each document, 0 = negative, 1 = positive
def trainNB0(trainMatrix, trainCategory):
    # number of documents
    numTrainDocs = len(trainMatrix)
    # vocabulary size
    numWords = len(trainMatrix[0])
    # prior probability of class 1 (the name pAbusive is inherited
    # from the spam-filter version of this code)
    pAbusive = sum(trainCategory) / float(numTrainDocs)
    # p1Num[j] accumulates how often word j appears in class-1 documents:
    # [1,2,3,4,0,0,1] would mean the first word appeared once, the second twice, ...
    # p1Denom is the total number of words seen in that class, i.e. 1+2+3+4+1 = 11.
    # Counts start at 1 and denominators at 2 (Laplace smoothing), so a word
    # unseen in one class never produces a zero probability.
    p0Num = np.ones(numWords); p1Num = np.ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # Per-word frequency within each class, e.g. [1/11, 2/11, 3/11, 4/11, ...],
    # stored as logs so classifyNB can add instead of multiplying tiny numbers.
    p1Vect = [math.log(x / p1Denom) for x in p1Num]
    p0Vect = [math.log(x / p0Denom) for x in p0Num]
    return p1Vect, p0Vect, pAbusive

def classifyNB(vec2Classify, p1Vect, p0Vect, pClass1):
    # Log-posterior (up to a shared constant) of each class: the
    # count-weighted sum of log word probabilities plus the log prior.
    p1VecSum = 0; p0VecSum = 0
    for i in range(len(p0Vect)):
        p1VecSum += vec2Classify[i] * p1Vect[i]
        p0VecSum += vec2Classify[i] * p0Vect[i]
    p1 = p1VecSum + math.log(pClass1)
    p0 = p0VecSum + math.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0

# Pick out the 2000 most frequent tokens (stop words such as "with", "and", "you", "for").
def calcMostFreq(vocabList, fullText):
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    # sort tokens by frequency, highest first
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:2000]

def textParse(review):
    # Strip the HTML tags and keep only the text content.
    wordList = BeautifulSoup(review, "html.parser").get_text()
    # Keep only letters; everything else becomes a space.
    wordList = re.sub("[^a-zA-Z]", " ", wordList)
    # Lower-case, split into words, and drop tokens shorter than 3 characters.
    wordList = [tok.lower() for tok in wordList.split() if len(tok) > 2]
    return wordList

def getDataSet():
    trainingSet = pd.read_csv("labeledTrainData.tsv", header=0, delimiter="\t", quoting=3)
    testSet = pd.read_csv("testData.tsv", header=0, delimiter="\t")
    testSubmission = pd.read_csv("sampleSubmission.csv")
    dataSet = pd.read_csv("unlabeledTrainData.tsv", header=0, delimiter="\t")
    return trainingSet, testSet, testSubmission, dataSet

def bagsOfPopcorn(trainingSet):
    docList = []; classList = []; fullText = []
    for i in range(len(trainingSet['id'])):
        wordList = textParse(trainingSet['review'][i])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(trainingSet['sentiment'][i])
    vocabList = creatVocabList(docList)
    # Drop the 2000 highest-frequency words from the vocabulary.
    top2000Words = calcMostFreq(vocabList, fullText)
    for word in top2000Words:
        if word[0] in vocabList:
            vocabList.remove(word[0])
    trainMat = []
    for i in range(len(docList)):
        trainMat.append(bagOfWords2VecMN(vocabList, docList[i]))
    p1Vect, p0Vect, pAbusive = trainNB0(trainMat, classList)
    return vocabList, p1Vect, p0Vect, pAbusive

def getResult(vocabList, dataSet, p1V, p0V, pAb):
    Result = []
    for i in range(len(dataSet)):
        wordList = textParse(dataSet['review'][i])
        testList = bagOfWords2VecMN(vocabList, wordList)
        Result.append(classifyNB(testList, p1V, p0V, pAb))
    return Result

def testModel(vocabList, testSet, p1V, p0V, pAb, testSubmission):
    errorCount = 0
    testResult = getResult(vocabList, testSet, p1V, p0V, pAb)
    result = pd.DataFrame({"id": testSet['id'], "sentiment": testResult})
    result.to_csv("result.csv", index=False)
    for i in range(len(testResult)):
        if testResult[i] != testSubmission['sentiment'][i]:
            errorCount += 1
    print('error rate is : ', float(errorCount) / len(testResult))

def getSentiment(vocabList, dataSet, p1V, p0V, pAb):
    Result = getResult(vocabList, dataSet, p1V, p0V, pAb)
    output = pd.DataFrame({"id": dataSet['id'], "sentiment": Result})
    output.to_csv("result_test.csv", index=False)
    # print(output.info())
    # print(output[:3])

if __name__ == '__main__':
    trainingSet, testSet, testSubmission, dataSet = getDataSet()
    vocabList, p1V, p0V, pAb = bagsOfPopcorn(trainingSet)
    testModel(vocabList, testSet, p1V, p0V, pAb, testSubmission)
    getSentiment(vocabList, dataSet, p1V, p0V, pAb)
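
As a quick sanity check, the training and classification functions above can be exercised on a tiny hand-made corpus (hypothetical data, purely illustrative):

# Sanity-check trainNB0/classifyNB on a made-up two-document corpus.
toyDocs = [["great", "fun", "great"], ["boring", "dull"]]
toyLabels = [1, 0]  # 1 = positive, 0 = negative
toyVocab = creatVocabList(toyDocs)
toyMat = [bagOfWords2VecMN(toyVocab, doc) for doc in toyDocs]
p1V, p0V, pAb = trainNB0(toyMat, toyLabels)
print(classifyNB(bagOfWords2VecMN(toyVocab, ["great", "fun"]), p1V, p0V, pAb))  # expect 1
print(classifyNB(bagOfWords2VecMN(toyVocab, ["boring"]), p1V, p0V, pAb))        # expect 0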

Issues

  1. When taking testSet['id'], tolist() returned ['"12311_10"', '"8348_2"', ...], with the ids still wrapped in quote characters.

    The code was: testSet = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)

    Removing quoting=3 fixes it, as the snippet below shows.
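
A minimal reproduction of the behavior (quoting=3 is csv.QUOTE_NONE, which tells pandas to treat quote characters as ordinary text):

import pandas as pd

# With quoting=3 (csv.QUOTE_NONE) the surrounding quotes survive parsing:
testSet = pd.read_csv("testData.tsv", header=0, delimiter="\t", quoting=3)
print(testSet['id'].tolist()[:2])  # ['"12311_10"', '"8348_2"']

# With the default quoting, the quote characters are parsed away:
testSet = pd.read_csv("testData.tsv", header=0, delimiter="\t")
print(testSet['id'].tolist()[:2])  # ['12311_10', '8348_2']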