博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
机器学习 — 文档过滤
阅读量:6547 次
发布时间:2019-06-24

本文共 6973 字,大约阅读时间需要 23 分钟。

分类

分类方法:

  1. 朴素贝叶斯分类法
  2. 费舍尔分类法
import re
import math


# --- feature extraction -----------------------------------------------------

def getwords(doc):
    """Split *doc* into lowercase words and return {word: 1} per distinct word.

    Only tokens longer than 2 and shorter than 20 characters are kept.
    """
    # \W+ (one or more non-word chars). The original \W* also matched the
    # empty string, which breaks re.split on Python 3.7+ and was an erratum
    # in the book this code comes from.
    splitter = re.compile(r'\W+')
    words = [s.lower() for s in splitter.split(doc) if 2 < len(s) < 20]
    return {word: 1 for word in words}


class classifier:
    """Trainable base classifier: counts feature/category co-occurrences."""

    def __init__(self, getfeatures, filename=None):
        # feature -> {category: count of documents containing the feature}
        self.featurecatacount = {}
        # category -> number of documents trained for that category
        self.catagorycount = {}
        # callable that maps a document to its feature dict
        self.getfeatures = getfeatures

    def incfeaturecount(self, feature, catagory):
        """Increase the count of *feature* within *catagory*."""
        self.featurecatacount.setdefault(feature, {})
        self.featurecatacount[feature].setdefault(catagory, 0)
        self.featurecatacount[feature][catagory] += 1

    def inccatacount(self, catagory):
        """Increase the number of documents seen for *catagory*."""
        self.catagorycount.setdefault(catagory, 0)
        self.catagorycount[catagory] += 1

    def featurecount(self, feature, catagory):
        """Times *feature* has appeared in *catagory*, as a float."""
        if feature in self.featurecatacount and catagory in self.featurecatacount[feature]:
            return float(self.featurecatacount[feature][catagory])
        return 0.0

    def catacount(self, catagory):
        """Number of documents belonging to *catagory*, as a float."""
        if catagory in self.catagorycount:
            return float(self.catagorycount[catagory])
        return 0.0

    def totalcount(self):
        """Total number of documents trained so far."""
        return sum(self.catagorycount.values())

    def catagories(self):
        """All known category names."""
        return self.catagorycount.keys()

    def train(self, item, catagory):
        """Train on one document *item* labelled with *catagory*."""
        features = self.getfeatures(item)
        # Bump each feature's count for this category...
        for f in features:
            self.incfeaturecount(f, catagory)
        # ...and the document count for the category itself.
        self.inccatacount(catagory)

    def featureprob(self, feature, catagory):
        """P(feature | catagory): feature count over documents in the category."""
        count = self.catacount(catagory)
        if count == 0:
            return 0
        return self.featurecount(feature, catagory) / count

    def weightedprob(self, feature, catagory, prf, weight=1, ap=0.5):
        """Weighted average of prf(feature, catagory) and an assumed prior.

        Args:
            feature: the feature being scored
            catagory: the category being scored against
            prf: probability function, e.g. ``self.featureprob``
            weight: weight given to the assumed probability
            ap: assumed probability before any evidence is seen

        Rarely seen features are pulled towards *ap*, so one stray
        occurrence cannot dominate the score.
        """
        basicprob = prf(feature, catagory)
        # Total occurrences of the feature across all categories.
        totals = sum(self.featurecount(feature, c) for c in self.catagories())
        return (weight * ap + totals * basicprob) / (weight + totals)


class naivebayes(classifier):
    """Naive Bayes classifier.

    Features are assumed mutually independent, so the document probability
    is the product of the individual feature probabilities.
    """

    def __init__(self, getfeatures):
        classifier.__init__(self, getfeatures)
        # category -> factor by which the winner must beat every rival
        self.thresholds = {}

    def setthreshold(self, cat, threshold):
        self.thresholds[cat] = threshold

    def getthrreshold(self, cat):
        # NOTE(review): the misspelled name is kept for backward
        # compatibility with existing callers.
        if cat not in self.thresholds:
            return 1.0
        return self.thresholds[cat]

    def docprob(self, item, catagory):
        """P(item | catagory): product of weighted feature probabilities."""
        features = self.getfeatures(item)
        p = 1
        for f in features:
            p *= self.weightedprob(f, catagory, self.featureprob)
        return p

    def prob(self, item, catagory):
        """Unnormalised Bayes score: P(catagory) * P(item | catagory).

        The common denominator P(item) is omitted, since it does not
        change which category wins.
        """
        catprob = self.catacount(catagory) / self.totalcount()
        return catprob * self.docprob(item, catagory)

    def classify(self, item, default=None):
        """Return the most probable category, or *default* when the winner
        does not beat a rival by its threshold (or nothing was trained)."""
        probs = {}
        # Start from *default* so an untrained classifier cannot raise
        # UnboundLocalError (the original left `best` unassigned).
        best = default
        maxprob = 0.0
        for cat in self.catagories():
            probs[cat] = self.prob(item, cat)
            if probs[cat] > maxprob:
                maxprob = probs[cat]
                best = cat
        # The winner must exceed every rival scaled by its threshold.
        for cat in probs:
            if cat == best:
                continue
            if probs[cat] * self.getthrreshold(best) > probs[best]:
                return default
        return best


class fisherclassifier(classifier):
    """Fisher classifier.

    Combines per-feature category probabilities with Fisher's method:
    -2 * ln(product) is tested against an inverse chi-square distribution.
    """

    def __init__(self, getfeatures):
        classifier.__init__(self, getfeatures)
        # category -> minimum fisherprob required to select that category
        self.minimums = {}

    def setminimums(self, cat, minimums):
        self.minimums[cat] = minimums

    def getminimums(self, cat):
        if cat not in self.minimums:
            return 0
        return self.minimums[cat]

    def cprob(self, f, cat):
        """P(cat | f): feature frequency in *cat*, normalised over all categories."""
        # Frequency of the feature within this category.
        clf = self.featureprob(f, cat)
        # Frequency of the feature across every category.
        freqsum = sum(self.featureprob(f, c) for c in self.catagories())
        # Unseen feature: no evidence either way (the original divided by
        # zero here for features absent from all training data).
        if freqsum == 0:
            return 0
        return clf / freqsum

    def fisherprob(self, item, cat):
        """Combined probability that *item* belongs to *cat*."""
        p = 1
        features = self.getfeatures(item)
        for f in features:
            p *= self.weightedprob(f, cat, self.cprob)
        # Fisher's method: -2 ln(product) ~ chi-square with 2k degrees of
        # freedom, where k is the number of features.
        fscore = -2 * math.log(p)
        return self.invchi2(fscore, len(features) * 2)

    def invchi2(self, chi, df):
        """Inverse chi-square function (upper tail, even *df*)."""
        m = chi / 2.0
        summ = term = math.exp(-m)
        for i in range(1, df // 2):
            term *= m / i
            summ += term
        return min(summ, 1.0)

    def classify(self, item, default=None):
        """Return the category with the highest fisherprob above its
        minimum, or *default* when none qualifies."""
        best = default
        maxprob = 0.0
        for c in self.catagories():
            p = self.fisherprob(item, c)
            # Must clear both the per-category floor and the current leader.
            if p > self.getminimums(c) and p > maxprob:
                best = c
                maxprob = p
        return best


def sampletrain(cl):
    """Train *cl* on a tiny good/bad sample set used by the demos below."""
    cl.train('Nobody owns the water.', 'good')
    cl.train('the quick rabbit jumps fences', 'good')
    cl.train('buy pharmaceuticals now', 'bad')
    cl.train('make quick money at the online casino', 'bad')
    cl.train('the quick brown fox jumps', 'good')
# Base-classifier demo: train a few documents, then inspect counts and
# probabilities. print(...) with a single argument behaves identically on
# Python 2 (where the original `print x` statement form is Python-2-only).
cl = classifier(getwords)
cl.train('the quick brown fox jumps over the lazy dog', 'good')
cl.train('make quick money in the online cassino', 'bad')
print(cl.featurecount('quick', 'good'))
print(cl.featurecount('quick', 'bad'))
sampletrain(cl)
print(cl.featureprob('quick', 'good'))
print(cl.weightedprob('quick', 'good', cl.featureprob))
1.0
1.0
0.75
0.708333333333
# Naive Bayes demo: scores, classification, and the effect of a threshold.
# print(...) with one argument is valid on both Python 2 and Python 3.
bayes = naivebayes(getwords)
sampletrain(bayes)
print(bayes.prob('quick rabbit', 'good'))
print(bayes.prob('quick rabbit', 'bad'))
print(bayes.classify('quick rabbit', 'unknown'))
print(bayes.classify('quick money', 'unknown'))
# Require 'bad' to beat rivals 3x before it may be chosen.
bayes.setthreshold('bad', 3.0)
print(bayes.classify('quick money', 'unknown'))
# More training data restores confidence in the 'bad' label.
for i in range(10):
    sampletrain(bayes)
print(bayes.classify('quick money', 'unknown'))
0.15625
0.05
good
bad
unknown
bad
# Fisher classifier demo: probabilities, classification, and minimums.
# print(...) with one argument is valid on both Python 2 and Python 3.
fisher = fisherclassifier(getwords)
sampletrain(fisher)
fisher.cprob('quick', 'good')
print(fisher.fisherprob('quick rabbit', 'good'))
print(fisher.fisherprob('quick rabbit', 'bad'))
print(fisher.classify('quick rabbit'))
print(fisher.classify('quick money'))
# A high floor for 'bad' flips the decision back to 'good'.
fisher.setminimums('bad', 0.8)
print(fisher.classify('quick money'))
fisher.setminimums('bad', 0.4)
print(fisher.classify('quick money'))
0.78013986589
0.356335962833
good
bad
good
bad

转载于:https://www.cnblogs.com/sunshine-2015/p/6637589.html

你可能感兴趣的文章
第二章家庭作业 2.78
查看>>
Android 下拉刷新上拉载入 多种应用场景 超级大放送(上)
查看>>
Risc-V指令集
查看>>
Python进阶04 函数的参数对应
查看>>
C语言结构体的“继承”
查看>>
WebView之禁止调用第三方浏览器
查看>>
POJ 3468 A Simple Problem with Integers(线段树 区间更新)
查看>>
安装apr-1.6.3报错[cannot remove `libtoolT’: No such file or directory]解决方法
查看>>
C# 操作Excel,控制格式[转]
查看>>
iOS开发中一些常用的属性
查看>>
Git 使用教程
查看>>
spring--基于ioc的配置文件方式
查看>>
“小 U”- UI自动化测试平台 [自动化测试平台开发实战 - 基于 Spring Boot + Kotlin]...
查看>>
easyui的一些使用方法
查看>>
Vue使用过程中的可能会遇到的几个问题
查看>>
TIMO 后台管理系统 v2.0.1 发布,加入 jwt 身份验证组件,基于 Spring Boot
查看>>
Java 11 将至,不妨了解一下 Oracle JDK 之外的版本
查看>>
Log4j_学习_03_自己动手封装log工具
查看>>
Redis的各项功能解决了哪些问题?
查看>>
FastAdmin 极速后台管理框架 1.0.0.20190301_beta
查看>>