文章推荐系统

本项目利用网络爬虫抓取文章，然后对文章进行分类，并利用余弦相似度算法计算文本相似度，从而实现文章推荐。

应用介绍

本项目利用网络爬虫抓取文章，然后对文章进行分类，并利用余弦相似度算法计算文本相似度，从而实现文章推荐。

#!/usr/bin/python
# Filename: bayes_sort.py
# _*_ coding:utf-8 _*_

from numpy import *
import re
import random
import pymongo
from bson import ObjectId
import jieba
import sys

def _fetchLabelledArtical(db):
    """Claim one labelled, not-yet-consumed article and return its tokens.

    Looks up an ``artical_tag`` document that has a ``catagore`` field but no
    ``is_trained`` field, reads the matching HTML file from
    ``../data_spider/html/``, marks the tag with ``is_trained = 1`` and runs
    the text through removeLabel -> jiebacut -> removeStopWords.

    Args:
        db: database handle exposing the ``artical_tag`` and ``artical``
            collections.

    Returns:
        A ``(word_list, catagore)`` tuple: the cleaned token list and the
        category stored on the tag.

    Raises:
        SystemExit: with status 1 when no matching tag is left.
    """
    artical_tag = db.artical_tag.find_one({'catagore':{'$exists':True}, 'is_trained':{'$exists':False}})
    if not artical_tag:
        # sys.exit rather than the bare exit(): the latter is an interactive
        # helper injected by the site module and is not guaranteed to exist.
        sys.exit(1)
    artical = db.artical.find_one({'_id':ObjectId(artical_tag["a_id"])})
    # Read as bytes and decode explicitly so line endings are left untouched.
    with open("../data_spider/html/" + artical['title_hash'] + ".html", "rb") as f:
        artical_content = f.read().decode("utf-8")
    # Flag the tag before preprocessing so the same article is not picked again.
    artical_tag['is_trained'] = 1
    db.artical_tag.save(artical_tag)
    word_list = removeStopWords(jiebacut(removeLabel(artical_content)))
    return word_list, artical_tag['catagore']


def fetchArticalTrain(db):
    """Fetch one labelled article to train on.

    Returns ``(word_list, catagore)``; exits the process with status 1 when
    no labelled, unconsumed article remains.
    """
    return _fetchLabelledArtical(db)


def fetchArticalClassify(db):
    """Fetch one article to classify.

    Returns ``(word_list, catagore)``; exits the process with status 1 when
    no candidate remains.

    NOTE: this draws from the same pool as fetchArticalTrain (labelled, no
    ``is_trained`` flag), so the returned ``catagore`` is the known label the
    caller can compare the prediction against. Selecting genuinely unlabelled
    articles would use the query ``{'catagore': {'$exists': False}}`` instead.
    """
    return _fetchLabelledArtical(db)

def removeLabel(content):
    """Strip markup tags and whitespace from *content*.

    Everything between ``<`` and the next ``>`` is dropped, newlines, spaces
    and tabs are removed, and every ``.`` becomes ``_``.

    NOTE(review): the ``.`` -> ``_`` swap is presumably there because tokens
    are later used as document field names in trainBayes -- confirm.
    """
    without_tags = re.sub(r'<[^>]+>', '', content, flags=re.S)
    # One translation pass instead of four chained replace() calls.
    cleanup = str.maketrans({"\n": None, " ": None, "\t": None, ".": "_"})
    return without_tags.translate(cleanup)

def jiebacut(content):
    """Segment *content* into words with jieba and return them as a list.

    ``jieba.cut`` is called with ``cut_all=False`` and its result is
    materialised so callers get a real list they can mutate.
    """
    return list(jieba.cut(content, cut_all=False))

def removeStopWords(word_list):
    """Remove every stop word from *word_list* in place and return it.

    Stop words are read from ``stopwords.txt`` in the current working
    directory, one per line. The list object passed in is the one returned,
    so callers holding a reference see the filtered contents.

    Args:
        word_list: list of tokens; modified in place.

    Returns:
        The same list object with all stop-word occurrences removed.

    Raises:
        OSError: if ``stopwords.txt`` cannot be opened.
    """
    # NOTE(review): the file is opened with the platform default encoding, as
    # before; confirm the stop-word file's encoding before pinning one.
    with open("stopwords.txt", "r") as f:
        # A set gives O(1) membership tests; the previous per-stop-word
        # `in` / `remove` loop rescanned the whole token list every time.
        stop_words = {line.replace("\n", '') for line in f}
    # Slice assignment keeps the caller's list object.
    word_list[:] = [word for word in word_list if word not in stop_words]
    return word_list

def trainBayes(word_list, cata_num, db):
    """Fold one article's tokens into the stored word counts.

    ``db.bayes_words`` holds one document per category (``cata_num`` >= 0)
    whose extra fields are ``token -> occurrence count``, plus one document
    with ``cata_num == -1`` whose ``total`` is the token count over all
    categories. Both documents are created on first use.

    Args:
        word_list: tokens of the article being learned.
        cata_num: category number the article belongs to.
        db: database handle exposing the ``bayes_words`` collection.
    """
    # Tokens share a document with these bookkeeping fields, so a token that
    # happens to equal one of them must not be stored: it would overwrite the
    # running total, the category number or the document id.
    reserved = ('_id', 'cata_num', 'total')

    # Global token total, kept on the cata_num == -1 document.
    overall = db.bayes_words.find_one({'cata_num':-1})
    if not overall:
        db.bayes_words.insert({'cata_num':-1, 'total':0})
        overall = db.bayes_words.find_one({'cata_num':-1})
    overall['total'] += len(word_list)
    db.bayes_words.save(overall)

    # Per-category total and per-token counts.
    item = db.bayes_words.find_one({'cata_num':cata_num})
    if not item:
        db.bayes_words.insert({'cata_num':cata_num, 'total':0})
        item = db.bayes_words.find_one({'cata_num':cata_num})
    item['total'] = item['total'] + len(word_list)
    for word in word_list:
        if word in reserved:
            continue
        item[word] = item.get(word, 0) + 1
    db.bayes_words.save(item)

def classify(word_list, db):
    """Score every category against *word_list* and return the best one.

    For each token occurrence, every category whose ``bayes_words`` document
    contains the token gets ``P(token|cata) * P(cata) / P(token)`` added to
    its score, with the probabilities estimated from the stored counts. The
    score table and the winner are printed.

    Args:
        word_list: tokens of the article to classify.
        db: database handle exposing the ``bayes_words`` and ``catagore``
            collections.

    Returns:
        The category number with the highest score, or ``None`` when nothing
        has been trained yet or no token matched any category.
    """
    # Bookkeeping fields live next to the token counts; a token with one of
    # these names would be read back as a bogus "count", so it is ignored.
    reserved = ('_id', 'cata_num', 'total')

    overall = db.bayes_words.find_one({'cata_num':-1})
    if not overall:
        return None
    total_num = overall['total']

    cata_total = {}
    for item in db.bayes_words.find({"cata_num":{"$gte":0}},{"total":1, "cata_num":1}):
        cata_total[item['cata_num']] = item['total']

    catagores = [cata['num'] for cata in db.catagore.find()]

    # token -> [(category, count), ...]; filled once per distinct token so a
    # repeated token does not trigger the same database lookups again.
    hits_by_word = {}
    cata_probability = {}
    for word in word_list:
        if word in reserved:
            continue
        if word not in hits_by_word:
            hits = []
            for cata in catagores:
                item = db.bayes_words.find_one({"cata_num":cata, word:{'$exists':True}},{word:1, "cata_num":1})
                if item:
                    hits.append((cata, item[word]))
            hits_by_word[word] = hits
        hits = hits_by_word[word]

        # Occurrences of this token across all categories. Explicit loop on
        # purpose: the module star-imports numpy, which may rebind sum().
        word_num = 0
        for _, count in hits:
            word_num += count

        for cata, count in hits:
            likelihood = count / cata_total[cata]
            prior = cata_total[cata] / total_num
            evidence = word_num / total_num
            cata_probability[cata] = cata_probability.get(cata, 0) + likelihood * prior / evidence

    print(cata_probability)
    if not cata_probability:
        return None

    # Start from category 0 when it scored (so ties resolve as they always
    # did), otherwise from the first scored category instead of raising
    # KeyError on the missing entry.
    res = 0 if 0 in cata_probability else next(iter(cata_probability))
    best_score = cata_probability[res]
    for cata in cata_probability:
        if best_score < cata_probability[cata]:
            best_score = cata_probability[cata]
            res = cata
    print(str(res) + "is the max catagore.")
    return res

if __name__ == '__main__':
    # Usage: bayes_sort.py train|classify
    # Each run handles a single article taken from the local MongoDB.
    client = pymongo.MongoClient(host='127.0.0.1', port=27017)
    db = client['ArticalRecommend']
    # A missing argument is treated like an unknown one instead of raising
    # IndexError on sys.argv[1].
    mode = sys.argv[1] if len(sys.argv) > 1 else None
    if mode == "train":
        word_list, cata_num = fetchArticalTrain(db)
        trainBayes(word_list, cata_num, db)
    elif mode == "classify":
        word_list, cata_num = fetchArticalClassify(db)
        res = classify(word_list, db)
        # "1111": the prediction equals the label stored with the article;
        # "2222": it does not.
        if res == cata_num:
            print("1111")
        else:
            print("2222")
    else:
        print("para error. train/classify")
    # NOTE(review): the exit status is 0 on the usage-error path as well, as
    # before; confirm no caller relies on that before changing it.
    sys.exit(0)

文件列表(部分)

名称 大小 修改日期
bayes_sort.py 1.54 KB 2017-06-12
extract_tag.py 0.89 KB 2017-06-12
run_classify.sh 0.16 KB 2017-06-12
run_extract_tag.sh 0.16 KB 2017-06-12
run_train.sh 0.16 KB 2017-06-12
similarity_pair_push.py 0.52 KB 2017-06-12
similarity_queue_process.py 1.45 KB 2017-06-12
stopwords.txt 8.51 KB 2017-06-12
__init__.py 0.00 KB 2017-06-12
mongo_rsa.conf 0.17 KB 2017-06-12
mongo_rsb.conf 0.17 KB 2017-06-12
mongo_rsc.conf 0.17 KB 2017-06-12
setting.js 0.13 KB 2017-06-12
common.py 0.14 KB 2017-06-12
items.py 0.28 KB 2017-06-12
middlewares.py 1.37 KB 2017-06-12
pipelines.py 0.61 KB 2017-06-12
ImageUrl_Process.py 0.56 KB 2017-06-12
__init__.py 0.00 KB 2017-06-12
settings.py 1.40 KB 2017-06-12
toutiao.py 1.62 KB 2017-06-12
__init__.py 0.13 KB 2017-06-12
toutiao.cpython-36.pyc 1.73 KB 2017-06-12
__init__.cpython-36.pyc 0.14 KB 2017-06-12
__init__.py 0.00 KB 2017-06-12
common.cpython-36.pyc 0.28 KB 2017-06-12
items.cpython-36.pyc 0.38 KB 2017-06-12
pipelines.cpython-36.pyc 0.80 KB 2017-06-12
settings.cpython-36.pyc 0.33 KB 2017-06-12
__init__.cpython-36.pyc 0.14 KB 2017-06-12

立即下载

相关下载

[磁耦合谐振buckss仿真] 无线电能传输技术可有效地解决电源接入问题,使充电过程便捷、安全,解决了传统依靠电导体直接进行物理接触的电源直接接触式输电模式所带来的插电火花、积碳、不易维护、易产生磨损,特别是在特殊环境下用电存在的安全隐患等问题。
[QT学习日记篇01(1)-QT界面初探- *.pro文件详解] Qt基础课程完结项目,完成一款小游戏并封装:翻金币游戏,通过点击金币进行翻面,让所有金币为同一面就游戏通过进入下一关。 过程中会使用前面学到的 “信号和槽”,“Qt图片资源显示”,“Qt播放音频”,“Qt绘图函数”,“Qt消息控件”等等知识。是一次前面所学知识的汇总。
[MMC模块化多电平换流器常用调制策略的对比分析] 在Matlab/Simulink环境下分别搭建了桥臂数量为6和10的两种调制方法的仿真模型,分析了模块数量与调制方法对输出的正弦电压的影响。
[svpwm同步调制] 同步调制下,通过把载波的频率固定为参考波频率的整数倍,同时固定两者的相位关系。通过这种方法可以消除3的倍数次谐波,并且通过适合pwm波正负半周期对称(N载波比取奇数),消除偶次谐波,总体同步调制所含谐波只为6k+1也就是5 7 11 13次谐波等。
[避雷器comsol仿真] comsol实现避雷器的电,磁,热仿真计算模型
[FPGA车牌识别代码]

评论列表 共有 0 条评论

暂无评论

微信捐赠

微信扫一扫体验

立即上传
发表评论
返回顶部