Python Doctor Recruitment Data

This Python project scrapes doctor recruitment postings and analyzes the working conditions of pediatricians. It also collects basic data on eight employer types: public hospitals, private hospitals, pharmaceutical companies, biotech companies, research institutes and universities, online publishers, other organizations, and clinics/pharmacies.

Application Overview

The crawler below queries the jobmd.cn WeChat mini-app search API through rotating proxies and stores each job posting in a local MongoDB:

import os
import json
import time
import random
import requests
from pymongo import MongoClient

class CrawlJob(object):
    def __init__(self):
        self.list_header = {'Host': 'api.jobmd.cn',
                            'Content-Type': 'application/x-www-form-urlencoded',
                            'Accept-Encoding': 'br, gzip, deflate',
                            'Connection': 'keep-alive',
                            'Accept': '*/*',
                            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_4 like Mac OS X)'
                                          ' AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/16D57 Mi'
                                          'croMessenger/7.0.3(0x17000321) NetType/4G Language/en',
                            'Referer': 'https://servicewechat.com/wx9a1e763032f69003/124/page-frame.html',
                            'Accept-Language': 'en-us'}
        self.list_url = 'https://api.jobmd.cn/api/wechatMiniApp/search'
        self.detail_header = {'Host': 'api.jobmd.cn',
                              'DXY-WXAPP-AUTH-TOKEN': '[object Object]',
                              'Content-Type': 'application/json',
                              'Connection': 'keep-alive',
                              'Accept': '*/*',
                              'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_4 like Mac OS X)'
                                            ' AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/16D57'
                                            ' MicroMessenger/7.0.3(0x17000321) NetType/4G Language/en',
                              'Referer': 'https://servicewechat.com/wx2c8b5efe895460dc/14/page-frame.html',
                              'Accept-Language': 'en-us',
                              'Accept-Encoding': 'br, gzip, deflate'}
        self.company_type = {'公立医院': 1, '民营医院': 2, '医药企业': 3, '生物企业': 4,
                             '科研院校': 5, '网络出版': 6, '其他单位': 7, '诊所/药房': 8}
        self.area_code = None
        host = os.environ.get('MONGODB_HOST', '127.0.0.1')  # local MongoDB by default
        port = os.environ.get('MONGODB_PORT', '27017')  # MongoDB port
        mongo_url = 'mongodb://{}:{}'.format(host, port)
        mongo_db = os.environ.get('MONGODB_DATABASE', 'DingXiang')
        client = MongoClient(mongo_url)
        self.db = client[mongo_db]
        self.db['fuchanke'].create_index('id', unique=True)  # unique job id as the dedup key; same collection get_job_detail() writes to
        self.all_id = []
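
    # Fetch DXY's location list and cache the area-name -> area-code mapping in area_code.txt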
    def get_area_code(self):
        url = 'https://assets.dxycdn.com/core/widgets/cascading-list-v2/data/location.js?t=20180226&t=2019324'
        res = requests.get(url)
        data = json.loads(res.text.replace('\n    ', '').replace('\n', '')[19:-25])  # strip the surrounding JavaScript to get the JSON payload
        area = []
        code = []
        for dist in data:
            area.append(dist['label'])
            code.append(dist['key'])
        self.area_code = dict(zip(area, code))
        with open('area_code.txt', 'w') as f:
            json.dump(self.area_code, f, ensure_ascii=False)  # cache as JSON instead of str(dict)
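
    # Search for jobs matching the keyword `wd` across every area and employer type,
    # paging through the results and passing each job id to get_job_detail()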
    def get_job_id(self, wd):
        with open('area_code.txt', 'r') as f:
            self.area_code = json.load(f)  # read the cached mapping back; json is safer than eval()
        for area, code in self.area_code.items():
            for c_type, num in self.company_type.items():
                page = 1
                has_more = 1
                while has_more:
                    post_data = {'wd': wd,
                                 'locations': str(code),
                                 'pageSize': '10',
                                 'pageNo': str(page),
                                 'salary': None,
                                 'companyType': str(num),
                                 'grade': None,
                                 'jobType': None,
                                 'jobYear': None}
                    try:
                        host, port = self.get_proxy_ip()
                        proxies = {"http": "http://{}:{}".format(host, port),
                                   "https": "http://{}:{}".format(host, port), }
                        res_list = requests.post(url=self.list_url,
                                                 headers=self.list_header,
                                                 data=post_data,
                                                 proxies=proxies,
                                                 timeout=3)
                        data = res_list.json()
                        if data['success'] and data['results']['pageBean']['totalCount'] > 0:
                            print('Request succeeded! [locations:{}, pageNo:{}, companyType:{}]'.format(code, page, num))
                            for li in data['results']['items']:
                                job_id = li['id']
                                self.get_job_detail(job_id, area)
                            total = data['results']['pageBean']['totalCount']
                            if total / 10 <= page:  # pageSize is 10, so the last page has been reached
                                has_more = 0
                            page += 1
                            time.sleep(random.random() * 2)
                        else:
                            print('No data on this page! [locations:{}, pageNo:{}, companyType:{}]'.format(code, page, num))
                            has_more = 0
                            time.sleep(random.random()*5)
                    except Exception:
                        print('Request failed! [locations:{}, pageNo:{}, companyType:{}]'.format(code, page, num))
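
    # Fetch the detail record for a single job id and upsert it into MongoDB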
    def get_job_detail(self, job_id, area):
        if job_id in self.all_id:
            print('Duplicate job id {}, skipping.'.format(job_id))
        else:
            detail_url = 'https://api.jobmd.cn/api/wechatMiniApp/entwork?id={}&recommendSize=0'.format(job_id)
            retry = 1
            while retry:
                try:
                    host, port = self.get_proxy_ip()
                    proxies = {"http": "http://{}:{}".format(host, port),
                               "https": "http://{}:{}".format(host, port), }
                    res_job = requests.get(url=detail_url,
                                           headers=self.detail_header,
                                           proxies=proxies,
                                           timeout=3)
                    de_data = res_job.json()
                    if de_data['success']:
                        item = de_data['results']['entwork']
                        item['area'] = area
                        self.db['fuchanke'].update_one({'id': item['id']}, {'$set': item}, upsert=True)
                        print('Saved record: {}!'.format(item))
                    retry = 0
                    time.sleep(random.random()*2)
                    self.all_id.append(job_id)
                except Exception:
                    print('Request for id {} failed!'.format(job_id))
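
    # Fetch a proxy host/port from the horocn proxy API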
    @staticmethod
    def get_proxy_ip():
        proxy_url = 'https://proxy.horocn.com/api/proxies?order_id=V8SX1629045840556986' \
                    '&num=1&format=json&line_separator=win&can_repeat=yes'
        res = requests.get(proxy_url)
        proxy = res.json()[0]  # parse the JSON response once
        return proxy['host'], proxy['port']

if __name__ == '__main__':
    crawler = CrawlJob()
    crawler.get_area_code()
    crawler.get_job_id('妇产科')
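
The crawled records stay in MongoDB; the all_data.csv in the file list below can be produced with a small export step. A minimal sketch, assuming the same environment variables as above (the two-column subset in `fields` is hypothetical; extend it to whatever keys the entwork records actually contain):

import csv
import os
from pymongo import MongoClient

client = MongoClient('mongodb://{}:{}'.format(os.environ.get('MONGODB_HOST', '127.0.0.1'),
                                              os.environ.get('MONGODB_PORT', '27017')))
db = client[os.environ.get('MONGODB_DATABASE', 'DingXiang')]

fields = ['id', 'area']  # hypothetical column subset; extend as needed
with open('all_data.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.DictWriter(f, fieldnames=fields, extrasaction='ignore')
    writer.writeheader()
    for doc in db['fuchanke'].find({}, {'_id': 0}):  # same collection the crawler writes
        writer.writerow(doc)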


Suited to learners of Python data analysis, Python web scraping, and data visualization, as well as Pandas users.
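
For the analysis side, a short Pandas sketch along these lines summarizes the exported postings by region (only `id` and `area` are guaranteed by the crawler above; any other columns depend on the API response):

import pandas as pd

df = pd.read_csv('all_data.csv')
print(df['area'].value_counts())       # number of postings per province
print(df.drop_duplicates('id').shape)  # count of unique postings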

File list (partial)

Name              Size         Modified
all_data.csv      1,210.26 KB  2019-12-30
dingxiang_job.py  2.29 KB      2019-12-30


