Classification
Classification methods (the core formulas are sketched right after this list):
- Naive Bayes classifier
- Fisher classifier
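
Before the code, a quick sketch of the math it implements (standard formulations; the Bayes form also appears in the `prob` docstring below). Naive Bayes treats the per-word probabilities as independent:

```latex
P(\text{cat} \mid \text{doc})
  = \frac{P(\text{doc} \mid \text{cat})\, P(\text{cat})}{P(\text{doc})},
\qquad
P(\text{doc} \mid \text{cat}) = \prod_{w \in \text{doc}} P(w \mid \text{cat})
```

Because $P(\text{doc})$ is identical for every category, the `prob` method below compares only the numerator, `catprob * docprob`.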
```python
import re
import math


# Split a document into word features
def getwords(doc):
    # Split on non-alphabetic characters and convert to lowercase
    splitter = re.compile(r'\W+')
    words = [s.lower() for s in splitter.split(doc)
             if 2 < len(s) < 20]
    # Each unique word becomes a feature
    return {word: 1 for word in words}


class classifier:
    def __init__(self, getfeatures, filename=None):
        # Counts of feature/category combinations
        self.featurecatacount = {}
        # Number of documents in each category
        self.catagorycount = {}
        self.getfeatures = getfeatures

    # Increase the count of a feature/category pair
    def incfeaturecount(self, feature, catagory):
        self.featurecatacount.setdefault(feature, {})
        self.featurecatacount[feature].setdefault(catagory, 0)
        self.featurecatacount[feature][catagory] += 1

    # Increase the count of a category
    def inccatacount(self, catagory):
        self.catagorycount.setdefault(catagory, 0)
        self.catagorycount[catagory] += 1

    # Number of times a feature has appeared in a category
    def featurecount(self, feature, catagory):
        if feature in self.featurecatacount and catagory in self.featurecatacount[feature]:
            return float(self.featurecatacount[feature][catagory])
        return 0.0

    # Number of items in a category
    def catacount(self, catagory):
        if catagory in self.catagorycount:
            return float(self.catagorycount[catagory])
        return 0.0

    # Total number of items
    def totalcount(self):
        return sum(self.catagorycount.values())

    # List of all categories
    def catagories(self):
        return self.catagorycount.keys()

    # Train the classifier on one item
    def train(self, item, catagory):
        features = self.getfeatures(item)
        # Increment the count of every feature under this category
        for f in features:
            self.incfeaturecount(f, catagory)
        # Increment the document count for this category
        self.inccatacount(catagory)

    # Probability that a word appears in a category
    def featureprob(self, feature, catagory):
        count = self.catacount(catagory)
        if count == 0:
            return 0
        # Times the feature appeared in this category,
        # divided by the number of items in this category
        return self.featurecount(feature, catagory) / count

    def weightedprob(self, feature, catagory, prf, weight=1, ap=0.5):
        """
        Compute a weighted average of the probability.

        Args:
            feature: the feature
            catagory: the category
            prf: the underlying probability function
            weight: weight given to the assumed probability
            ap: assumed initial probability
        """
        # Current probability estimate
        basicprob = prf(feature, catagory)
        # Times this feature has appeared across all categories
        totals = sum([self.featurecount(feature, c) for c in self.catagories()])
        # Weighted average of the assumed and observed probabilities
        bp = (weight * ap + totals * basicprob) / (weight + totals)
        return bp


class naivebayes(classifier):
    """
    Naive Bayes classifier: assumes the combined probabilities are mutually
    independent, so the overall probability is the product of the individual ones.
    """

    def __init__(self, getfeatures):
        classifier.__init__(self, getfeatures)
        self.thresholds = {}

    def setthreshold(self, cat, threshold):
        self.thresholds[cat] = threshold

    def getthreshold(self, cat):
        if cat not in self.thresholds:
            return 1.0
        return self.thresholds[cat]

    def docprob(self, item, catagory):
        features = self.getfeatures(item)
        # Multiply the probabilities of all the features together
        p = 1
        for f in features:
            p *= self.weightedprob(f, catagory, self.featureprob)
        return p

    def prob(self, item, catagory):
        """
        Bayes' theorem: swap the conditioning,
        P(A|B) = P(B|A) * P(A) / P(B)
        """
        catprob = self.catacount(catagory) / self.totalcount()
        docprob = self.docprob(item, catagory)
        return catprob * docprob

    def classify(self, item, default=None):
        probs = {}
        # Find the category with the highest probability
        best = default
        maxprob = 0.0
        for cat in self.catagories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > maxprob:
                maxprob = probs[cat]
                best = cat
        # Make sure the winner beats every other category
        # by more than its threshold
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getthreshold(best) > probs[best]:
                return default
        return best


class fisherclassifier(classifier):
    """
    Fisher classifier
    """

    def __init__(self, getfeatures):
        classifier.__init__(self, getfeatures)
        self.minimums = {}

    def setminimums(self, cat, minimums):
        self.minimums[cat] = minimums

    def getminimums(self, cat):
        if cat not in self.minimums:
            return 0
        return self.minimums[cat]

    def cprob(self, f, cat):
        # Frequency of this feature in this category
        clf = self.featureprob(f, cat)
        if clf == 0:
            return 0
        # Frequency of this feature across all categories
        freqsum = sum([self.featureprob(f, c) for c in self.catagories()])
        # Probability is the frequency in this category
        # divided by the overall frequency
        p = clf / freqsum
        return p

    def fisherprob(self, item, cat):
        # Multiply all the probabilities together
        p = 1
        features = self.getfeatures(item)
        for f in features:
            p *= self.weightedprob(f, cat, self.cprob)
        # Take the natural log and multiply by -2
        fscore = -2 * math.log(p)
        # Use the inverse chi-squared function to get a probability
        return self.invchi2(fscore, len(features) * 2)

    def invchi2(self, chi, df):
        m = chi / 2.0
        summ = term = math.exp(-m)
        for i in range(1, df // 2):
            term *= m / i
            summ += term
        return min(summ, 1.0)

    def classify(self, item, default=None):
        # Loop over the categories looking for the best result
        best = default
        maxprob = 0.0
        for c in self.catagories():
            p = self.fisherprob(item, c)
            # Make sure it exceeds this category's minimum
            if p > self.getminimums(c) and p > maxprob:
                best = c
                maxprob = p
        return best


def sampletrain(cl):
    cl.train('Nobody owns the water.', 'good')
    cl.train('the quick rabbit jumps fences', 'good')
    cl.train('buy pharmaceuticals now', 'bad')
    cl.train('make quick money at the online casino', 'bad')
    cl.train('the quick brown fox jumps', 'good')
```
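
Two pieces of the code above map onto compact formulas. `weightedprob` blends an assumed prior $a_p$ (default 0.5, with weight $w = 1$) with the observed probability, weighted by how often the feature has actually been seen. `fisherprob` combines the per-feature probabilities with Fisher's chi-squared method, and `invchi2` is the series expansion of the (upper-tail) chi-squared CDF for an even number of degrees of freedom $k$:

```latex
P_w(f \mid c) = \frac{w \cdot a_p + n_f \cdot P_{\text{basic}}(f \mid c)}{w + n_f},
\qquad
\chi^2 = -2 \ln \prod_i p_i,
\qquad
\text{invchi2}(\chi^2, k) = e^{-\chi^2/2} \sum_{i=0}^{k/2 - 1} \frac{(\chi^2/2)^i}{i!}
```

Here $n_f$ is the total number of times feature $f$ has appeared across all categories.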
```python
cl = classifier(getwords)
cl.train('the quick brown fox jumps over the lazy dog', 'good')
cl.train('make quick money in the online casino', 'bad')
print(cl.featurecount('quick', 'good'))
print(cl.featurecount('quick', 'bad'))
sampletrain(cl)
print(cl.featureprob('quick', 'good'))
print(cl.weightedprob('quick', 'good', cl.featureprob))
```
```
1.0
1.0
0.75
0.708333333333
```
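
Working through the last two numbers: after `sampletrain`, 'quick' appears in 3 of the 4 good documents, so `featureprob` is 3/4 = 0.75; it has been seen 5 times overall (3 good + 2 bad), so with weight 1 and assumed probability 0.5:

```latex
P_w = \frac{1 \cdot 0.5 + 5 \cdot 0.75}{1 + 5} = \frac{4.25}{6} \approx 0.7083
```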
```python
# Naive Bayes test
bayes = naivebayes(getwords)
sampletrain(bayes)
print(bayes.prob('quick rabbit', 'good'))
print(bayes.prob('quick rabbit', 'bad'))
print(bayes.classify('quick rabbit', 'unknown'))
print(bayes.classify('quick money', 'unknown'))
bayes.setthreshold('bad', 3.0)
print(bayes.classify('quick money', 'unknown'))
for i in range(10):
    sampletrain(bayes)
print(bayes.classify('quick money', 'unknown'))
```
```
0.15625
0.05
good
bad
unknown
bad
```
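
These follow directly from the five `sampletrain` documents: $P(\text{good}) = 3/5$, and the weighted probabilities of 'quick' and 'rabbit' given good are 0.625 and 0.4167, versus 0.5 and 0.25 given bad:

```latex
0.6 \times 0.625 \times 0.4167 \approx 0.15625,
\qquad
0.4 \times 0.5 \times 0.25 = 0.05
```

The `unknown` result shows the threshold at work: with `setthreshold('bad', 3.0)`, 'quick money' is classified as bad only if its bad score beats 3× its good score. After ten more rounds of training the evidence is strong enough, and the result flips back to `bad`.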
```python
# Fisher classifier test
fisher = fisherclassifier(getwords)
sampletrain(fisher)
fisher.cprob('quick', 'good')  # evaluates to ~0.571, not printed
print(fisher.fisherprob('quick rabbit', 'good'))
print(fisher.fisherprob('quick rabbit', 'bad'))
print(fisher.classify('quick rabbit'))
print(fisher.classify('quick money'))
fisher.setminimums('bad', 0.8)
print(fisher.classify('quick money'))
fisher.setminimums('bad', 0.4)
print(fisher.classify('quick money'))
```
```
0.78013986589
0.356335962833
good
bad
good
bad
```
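
The unprinted `cprob('quick', 'good')` is $(2/3) / (2/3 + 1/2) = 4/7 \approx 0.571$. For the first printed value, the weighted `cprob`s of 'quick' and 'rabbit' under good come out to 0.5536 and 0.75, and then:

```latex
\chi^2 = -2 \ln(0.5536 \times 0.75) \approx 1.758,
\qquad
\text{invchi2}(1.758,\, 4) \approx 0.7801
```

The last three `classify` calls show the minimums at work: `fisherprob('quick money', 'bad')` falls between 0.4 and 0.8, so raising the bad minimum to 0.8 pushes the result to `good`, and lowering it to 0.4 lets `bad` win again.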