Formulas
P(Y|X) = P(X|Y)P(Y)/P(X)
P(Y,X) = P(Y|X)P(X) = P(X|Y)P(Y)
P('belongs to some class' | 'has some feature')
To train, we only need some samples whose features and class labels are already known.
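A quick numeric check of the formula (a minimal sketch; all of the numbers are made up for illustration):

p_y = 0.3          # prior P(Y): probability a message is spam
p_x_given_y = 0.4  # likelihood P(X|Y): 'free' appears given spam
p_x = 0.15         # evidence P(X): 'free' appears at all
print(p_x_given_y * p_y / p_x)  # 0.8 (up to float rounding) = P(spam | 'free')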
Idea
The core idea of Bayes: pick the class with the higher posterior probability.
If P('belongs to class A' | 'has some feature') > P('belongs to class B' | 'has some feature'), then (x, y) belongs to class A.
Concepts
- Tokenization: splitting a sentence into finer-grained words.
- Stop words: words that do not help with classification.
- Bag-of-words model: the representation discards all word-order information (see the snippet after this list).
- Naive Bayes: the Bayesian method plus a conditional-independence assumption between features.
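A tiny illustration of what the bag-of-words model throws away (a sketch using only the standard library):

from collections import Counter

# two sentences with opposite meanings...
c1 = Counter('the dog bit the man'.split())
c2 = Counter('the man bit the dog'.split())
# ...become indistinguishable once word order is discarded
print(c1 == c2)  # True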
Text classification with Python
Workflow (a short preview sketch follows the list):
- Split the text: tokenize the corpus, then preprocess by removing punctuation, numbers, stop words, etc.
- Collect the vocabulary: build the set of all words, optionally keeping only a subset.
- Separate the data into a training set and a test set.
- For the training set, convert each token list into a numeric vector, e.g. [0,1,1,0,0].
- Train the algorithm: compute the conditional probabilities, e.g. [0, 1/2, 1/2, 0, 0].
- For the test set, convert each token list into a numeric vector as well.
- Compute P(X|Yi)P(Yi) per class from the vectors and compare which is larger.
- Compare the predictions against the test labels to get the error rate.
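As a preview, the first steps map directly onto the functions defined in the walkthrough below (a sketch; it assumes everything lives in one module):

posts, labels = loadDataSet()                         # sample docs, already tokenized
vocab = creatVocabList(posts)                         # vocabulary over the whole corpus
trainMat = [setOfWords2Vec(vocab, p) for p in posts]  # text -> numeric 0/1 vectors
print(trainMat[0].count(1))  # 7 -- the first post contains seven distinct vocabulary words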
Pseudocode:
    Count the number of documents in each class
    For each training document:
        For each class:
            If a token appears in the document -> increment that token's count
            Increment the total token count
    For each class:
        Divide each token's count by the total token count -> conditional probability
    Return the conditional probabilities for each class (i.e. each token's probability of appearing in that class)
Code walkthrough
Sample data
def loadDataSet():
    # toy corpus: six pre-tokenized posts from a dog-lovers message board
    postingList = [['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'],
                   ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'],
                   ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'],
                   ['stop', 'posting', 'stupid', 'worthless', 'garbage'],
                   ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'],
                   ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']]
    # class labels: 0 = normal, 1 = abusive
    classVec = [0, 1, 0, 1, 0, 1]
    return postingList, classVec
Tokenization
import re

def testParse(bigString):
    # split on any run of non-word characters, dropping punctuation and numbers
    listOfTokens = re.split(r'\W+', bigString)
    # lowercase everything and drop tokens of length 2 or less
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
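For example (a hypothetical input; note that short tokens such as 'is' and 'I' are dropped):

print(testParse('This book is the best book on Python I have ever read.'))
# ['this', 'book', 'the', 'best', 'book', 'python', 'have', 'ever', 'read']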
Collect the full vocabulary
def creatVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        # union of the running vocabulary with this document's words
        vocabSet = vocabSet | set(document)
    # return a list so callers can index into it in a fixed order
    return list(vocabSet)
Take the 30 most frequent tokens (words like with, and, you, for) so they can be removed later

import operator

# vocabList: deduplicated vocabulary; fullText: all tokens, duplicates included
def calcMostFreq(vocabList, fullText):
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    # sort by count, descending (dict.items(); Python 2's iteritems is gone in Python 3)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1),
                        reverse=True)
    return sortedFreq[:30]
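A quick usage example on made-up token lists:

vocab = ['the', 'dog', 'stupid']
fullText = ['the', 'the', 'the', 'dog', 'dog', 'stupid']
print(calcMostFreq(vocab, fullText))  # [('the', 3), ('dog', 2), ('stupid', 1)]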
Hold-out cross-validation (randomly pick 20 documents as the test set; the rest form the training set)
trainingSet = list(range(2 * minLen)); testSet = []
for i in range(20):
randIndex = int(np.random.uniform(0, len(trainingSet)))
testSet.append(trainingSet[randIndex])
del(trainingSet[randIndex])
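An equivalent split using `random.sample`, which draws 20 distinct indices in one call (a sketch; `minLen` is the feed length computed in the full example further below):

import random

indices = list(range(2 * minLen))
testSet = random.sample(indices, 20)                    # 20 distinct test indices
trainingSet = [i for i in indices if i not in testSet]  # the rest go to training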
Training the model: set-of-words model (does not count repeated occurrences of a word)
def setOfWords2Vec(vocabList, inputSet):
returnVet = [1 if w in inputSet else 0 for w in vocabList]
return returnVet
Training the model: bag-of-words model (counts repeated occurrences of a word)
def bagOfWords2VecMN(vocabList, inputSet):
    returnVet = [0] * len(vocabList)
    # walk the input words so repeats are counted, not just presence
    for word in inputSet:
        if word in vocabList:
            returnVet[vocabList.index(word)] += 1
    return returnVet
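Contrasting the two vectorizers on a post with a repeated word (hypothetical toy data):

vocab = ['dog', 'stupid', 'help']
post = ['stupid', 'dog', 'stupid']
print(setOfWords2Vec(vocab, post))    # [1, 1, 0] -- presence only
print(bagOfWords2VecMN(vocab, post))  # [1, 2, 0] -- 'stupid' counted twice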
Training the algorithm
Notes:
- To maximize a function you can maximize its natural logarithm instead: log is monotonic, and summing logs avoids the numerical underflow of multiplying many small probabilities (demonstrated below).
- Laplace smoothing: initialize every word's count to 1 and every total count to 2, so unseen words never produce a zero probability.
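A quick demonstration of the underflow problem the log trick avoids:

import math

probs = [0.01] * 200
product = 1.0
for p in probs:
    product *= p          # 0.01**200 = 1e-400, below the smallest positive float
print(product)            # 0.0 -- the product has underflowed
print(sum(math.log(p) for p in probs))  # about -921.0, still usable for comparison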
import numpy as np

# Training: estimate the probabilities from the word vectors
# trainMatrix: document matrix (numeric vectors)
# trainCategory: class label of each document, 0 = normal, 1 = spam
def trainNB0(trainMatrix, trainCategory):
    # number of documents
    numTrainDocs = len(trainMatrix)
    # vocabulary size
    numWords = len(trainMatrix[0])
    # prior probability that a document is spam
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    # per-word counts: e.g. [1,2,3,4,0,0,1] would mean the first word appeared
    # once in spam documents, the second twice, and so on; p1Denom is the total
    # word count in spam documents, here 1+2+3+4+1 = 11. With Laplace smoothing
    # the counts start at 1 and the denominators at 2 instead of 0.
    p0Num = np.ones(numWords); p1Num = np.ones(numWords)
    p0Denom = 2.0; p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    # each word's frequency per class, e.g. [1/11, 2/11, 3/11, 4/11, ...],
    # kept as logarithms to avoid numerical underflow
    p1Vect = np.log(p1Num/p1Denom)
    p0Vect = np.log(p0Num/p0Denom)
    # note the order: p0 first, then p1, matching callers that unpack p0V, p1V, pSpam
    return p0Vect, p1Vect, pAbusive
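A quick run on the toy data to sanity-check the trained model (a sketch; assumes the earlier functions are in scope):

posts, labels = loadDataSet()
vocab = creatVocabList(posts)
trainMat = [setOfWords2Vec(vocab, p) for p in posts]
p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(labels))
print(pAb)  # 0.5 -- three of the six sample posts are labeled abusive
# 'stupid' should be far more probable in the abusive class
print(p1V[vocab.index('stupid')] > p0V[vocab.index('stupid')])  # True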
Naive Bayes classification function
import math

# classify by comparing log-posteriors: sum(log P(xi|Y)) + log P(Y) for each class
def classifyNB(vec2Classify, p0Vect, p1Vect, pClass1):
    p1VecSum = 0; p0VecSum = 0
    for i in range(len(p0Vect)):
        p1VecSum += vec2Classify[i] * p1Vect[i]
        p0VecSum += vec2Classify[i] * p0Vect[i]
    p1 = p1VecSum + math.log(pClass1)
    p0 = p0VecSum + math.log(1.0 - pClass1)
    # vectorized equivalent when the inputs are numpy arrays:
    # p1 = sum(vec2Classify * p1Vect) + math.log(pClass1)
    # p0 = sum(vec2Classify * p0Vect) + math.log(1.0 - pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
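Putting the pieces together mirrors the `testingNB` driver from Machine Learning in Action (a sketch; assumes numpy is imported as np and the functions above are in scope):

def testingNB():
    posts, labels = loadDataSet()
    vocab = creatVocabList(posts)
    trainMat = [setOfWords2Vec(vocab, p) for p in posts]
    p0V, p1V, pAb = trainNB0(np.array(trainMat), np.array(labels))
    for entry in (['love', 'my', 'dalmation'], ['stupid', 'garbage']):
        vec = setOfWords2Vec(vocab, entry)
        # expected: 0 for the friendly post, 1 for the abusive one
        print(entry, 'classified as:', classifyNB(vec, p0V, p1V, pAb))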
Example: using naive Bayes to find regional word preferences in personal ads
After training the model, print the ten words with the highest conditional probability in each class.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import feedparser
import re
import words2vec as bayes
import numpy as np
import operator
def testParse(bigString):
    # split on any run of non-word characters, dropping punctuation and numbers
    listOfTokens = re.split(r'\W+', bigString)
    # lowercase everything and drop tokens of length 2 or less
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
# take the 30 most frequent tokens (words like with, and, you, for)
def calcMostFreq(vocabList, fullText):
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    # sort by count, descending
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1),
                        reverse=True)
    return sortedFreq[:30]
def localWords(feed1, feed0):
docList = []; classList = []; fullText = []
minLen = min(len(feed1['entries']), len(feed0['entries']))
# print(minLen)
for i in range(minLen):
wordList = testParse(feed1['entries'][i]['summary'])
docList.append(wordList)
fullText.extend(wordList)
classList.append(1)
wordList = testParse(feed0['entries'][i]['summary'])
docList.append(wordList)
fullText.extend(wordList)
classList.append(0)
# build the full vocabulary
vocabList = bayes.creatVocabList(docList)
# take the 30 most frequent tokens
top30Words = calcMostFreq(vocabList, fullText)
# print(top30Words)
# remove those high-frequency words from the vocabulary
for pairW in top30Words:
if pairW[0] in vocabList:
vocabList.remove(pairW[0])
# print(vocabList)
# print(len(vocabList))
# hold-out cross-validation: randomly pick 20 docs as the test set, train on the rest
trainingSet = list(range(2 * minLen)); testSet = []
for i in range(20):
randIndex = int(np.random.uniform(0, len(trainingSet)))
testSet.append(trainingSet[randIndex])
del(trainingSet[randIndex])
# print(testSet)
# train the model
trainMat = []; trainClasses = []
for docIndex in trainingSet:
trainMat.append(bayes.bagOfWords2VecMN(list(vocabList),docList[docIndex]))
trainClasses.append(classList[docIndex])
p0V,p1V,pSpam = bayes.trainNB0(np.array(trainMat), np.array(trainClasses))
# print('pSpam: ',pSpam)
# evaluate on the held-out documents and report the error rate
errorCount = 0
for docIndex in testSet:
wordVector = bayes.bagOfWords2VecMN(list(vocabList),docList[docIndex])
if bayes.classifyNB(wordVector,p0V,p1V,pSpam) != classList[docIndex]:
errorCount += 1
# print(docList[docIndex], 'error')
print('the error rate is: ', float(errorCount)/len(testSet))
return vocabList,p0V,p1V
def getTopWords(ny, sf):
    vocabList, p0V, p1V = localWords(ny, sf)
    tokens = list(vocabList)  # fix the token order once instead of per iteration
    topNY = []; topSF = []
    # keep only words whose log-probability clears a threshold
    for i in range(len(p0V)):
        if p0V[i] > -6.0:
            topSF.append((tokens[i], p0V[i]))
        if p1V[i] > -6.0:
            topNY.append((tokens[i], p1V[i]))
    sortedSF = sorted(topSF, key=lambda pair: pair[1], reverse=True)
    print('SF ***** ***** *****')
    for token, _ in sortedSF[:10]:
        print(token)
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print('NY ***** ***** *****')
    for token, _ in sortedNY[:10]:
        print(token)
if __name__ == '__main__':
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
getTopWords(ny,sf)