💛Preface💛
This article shares frontier knowledge and techniques from the 传知代码 platform~
We are about to step into a brand-new space and gain a fresh perspective on the technology~
All resources covered in this article are available on the 传知代码 platform.
The content below is sure to give you a whole new view of the AI-empowered era!!!
It is packed with practical material, so keep up~
📌Navigation📌
- 💡Key Points of This Chapter
- 🍞1. Overview
- 🍞2. Demo
- 🍞3. Core Logic
- 🫓Summary
💡Key Points of This Chapter
- Practicing web crawling with the Scrapy framework
🍞1. Overview
Using Python together with third-party data-collection libraries and the Scrapy framework, we crawl job postings from the web. The crawl must cover at least two page types: a title list page (with pagination) and a detail page.
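The article does not show the project skeleton, but the pipeline paths used later (e.g. spider51job.pipelines.ExcelPipeline) imply a standard Scrapy project named spider51job, as created by scrapy startproject spider51job:

spider51job/
├── scrapy.cfg
└── spider51job/
    ├── items.py         # JobItem definition (below)
    ├── middlewares.py   # Selenium downloader middleware (below)
    ├── pipelines.py     # ExcelPipeline / DbPipeline (below)
    ├── settings.py      # ITEM_PIPELINES and other configuration
    └── spiders/         # the spider module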

🍞2. Demo
Overall project structure (screenshot)
File export (screenshot)
Database export (screenshot)
🍞3. Core Logic
Writing the spider

import scrapy
from scrapy import Request, Selector
from scrapy.http import HtmlResponse

from spider51job.items import JobItem


class JobSpider(scrapy.Spider):
    # The original article omits the imports and class header; the names here are assumed
    name = 'job51'
    allowed_domains = ['jobs.51job.com']

    def start_requests(self):
        # Crawl the list pages; only page 1 for now
        for page in range(1, 2):
            yield Request(url="https://jobs.51job.com/beijing/p{}".format(page), callback=self.parse)
    def parse(self, response: HtmlResponse, **kwargs):
        # time.sleep(3)
        # Inspect the raw page content if needed
        # print(response.text)
        sel = Selector(response)
        # In the browser: press F12, select the target element, right-click and "Copy selector"
        # Raw CSS selector obtained: body > div.maincenter > div.mcon > div.left > div.detlist.gbox > div:nth-child(7)
        title_text = sel.css('title::text').extract_first()
        # title_text = sel.xpath('//title/text()').extract_first()
        # Sanity check: if even the title cannot be fetched, something is almost certainly wrong
        print("title_text:", title_text)
        list_items = sel.css('div.detlist.gbox > div')
        for list_item in list_items:
            job_item = JobItem()
            job_id = list_item.css('input::attr(value)').extract_first()
            title = list_item.css('p.info > span.title > a::text').extract_first()
            location = list_item.css('p.info > span.location.name::text').extract_first()
            salary = list_item.css('p.info > span.location:not(.name)::text').extract_first()
            # The raw text looks like '学历要求:本科' (degree requirement); split on the colon to keep only '本科'
            degree = list_item.css('p.order::text').extract_first().split(':')[1].strip()
            # Detail page URL
            detail_url = list_item.css('p.info > span.title > a::attr(href)').extract_first()
            print("test:", job_id, title, location, salary, degree, detail_url)
            job_item['job_id'] = job_id
            job_item['title'] = title
            job_item['location'] = location
            job_item['salary'] = salary
            job_item['degree'] = degree
            yield Request(url=detail_url,
                          callback=parse_detail,
                          cb_kwargs={'item': job_item})
def parse_detail(response: HtmlResponse, **kwargs):
    # Module-level callback (hence referenced above without self); receives the half-filled item via cb_kwargs
    job_item = kwargs['item']
    sel = Selector(response)
    # Raw CSS selector: div.tCompany_main > div.tBorderTop_box > div.tmsg.inbox
    company_detail = sel.css('div.tmsg.inbox::text').extract_first()
    print('company_detail:', company_detail)
    job_item['company_detail'] = company_detail
    yield job_item
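With the spider in place, it can be run from the project root with scrapy crawl job51 (using the spider name assumed above). A minimal programmatic runner, for reference:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Run the spider in-process, picking up settings.py (pipelines, middleware, etc.)
process = CrawlerProcess(get_project_settings())
process.crawl('job51')  # spider name assumed above
process.start()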
Defining the Items

import scrapy


class JobItem(scrapy.Item):
    # One Field per column that the pipelines will export
    job_id = scrapy.Field()
    title = scrapy.Field()
    location = scrapy.Field()
    salary = scrapy.Field()
    degree = scrapy.Field()
    company_detail = scrapy.Field()
Writing the file pipeline (Excel)

import openpyxl


class ExcelPipeline:
    def __init__(self):
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        self.ws.title = 'Jobs'
        # Header row: Job ID, Title, Location, Salary range, Degree required, Company detail
        self.ws.append(['职位ID', '职位',
                        '工作地点', '薪资范围',
                        '学历要求', '公司详情'])

    def open_spider(self, spider):
        pass

    def close_spider(self, spider):
        # Save the workbook once when the crawl finishes
        self.wb.save('51jobs.xlsx')

    def process_item(self, item, spider):
        company_detail, degree, job_id, location, salary, title = get_infos(item)
        self.ws.append((job_id, title, location, salary, degree, company_detail))
        return item
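The helper get_infos() is used by both pipelines but never shown in the article. Judging by the unpacking order (alphabetical), a plausible reconstruction is the sketch below; the field order is the only thing the article actually pins down:

def get_infos(item):
    # Hypothetical reconstruction of the article's undisclosed helper:
    # return the six item fields in alphabetical order, matching the unpacking above
    return (item.get('company_detail', ''), item.get('degree', ''),
            item.get('job_id', ''), item.get('location', ''),
            item.get('salary', ''), item.get('title', ''))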
Writing the database pipeline and creating the table

import pymysql


class DbPipeline:
    def __init__(self):
        self.conn = pymysql.connect(host='localhost', port=3306, user='root', password='root',
                                    db='spyder', charset='utf8mb4')
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        # Commit all buffered inserts in one transaction, then close the connection
        self.conn.commit()
        self.conn.close()

    def process_item(self, item, spider):
        company_detail, degree, job_id, location, salary, title = get_infos(item)
        self.cursor.execute('insert into tb_51job_items (job_id, title, location, salary, degree, company_detail) '
                            'values (%s,%s,%s,%s,%s,%s)',
                            (job_id, title, location, salary, degree, company_detail))
        return item
CREATE TABLE spyder.tb_51job_items (
	job_id varchar(100) NULL COMMENT 'Job ID',
	title varchar(100) NULL COMMENT 'Job title',
	location varchar(100) NULL COMMENT 'Work location',
	salary varchar(100) NULL COMMENT 'Salary range',
	`degree` varchar(100) NULL COMMENT 'Degree required',
	company_detail varchar(2000) NULL COMMENT 'Company detail'
)
ENGINE=InnoDB
DEFAULT CHARSET=utf8mb4
COLLATE=utf8mb4_0900_ai_ci;
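Note that DbPipeline commits only once, in close_spider, so a crash mid-crawl loses every row. A possible variant (not from the original article) commits in batches instead:

    def process_item(self, item, spider):
        company_detail, degree, job_id, location, salary, title = get_infos(item)
        self.cursor.execute('insert into tb_51job_items (job_id, title, location, salary, degree, company_detail) '
                            'values (%s,%s,%s,%s,%s,%s)',
                            (job_id, title, location, salary, degree, company_detail))
        self.batch_count = getattr(self, 'batch_count', 0) + 1
        if self.batch_count % 100 == 0:  # flush to MySQL every 100 rows
            self.conn.commit()
        return item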
Configuring the pipelines and their priorities
Both pipelines are registered in settings.py; the lower number runs first, so each item passes through ExcelPipeline (300) before DbPipeline (400).
ITEM_PIPELINES = {
   "spider51job.pipelines.ExcelPipeline": 300,
   "spider51job.pipelines.DbPipeline": 400
}
Writing the downloader middleware

from scrapy.http import HtmlResponse
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

# create_chrome_driver, generate_cookies and add_cookies are the project's own
# helper functions; the original article omits the imports and class header

class SeleniumDownloaderMiddleware:
    def __init__(self):
        self.browser = create_chrome_driver(headless=False)
        # Visit the home page first at startup to obtain cookie information
        self.browser.get('https://jobs.51job.com')
        cookie_file = '51job_cookies.json'
        # Alternatively, run test_generate_cookies.py manually in advance to generate
        # the cookie file, then simply call add_cookies() with it here
        generate_cookies(self.browser, cookie_file)
        add_cookies(self.browser, cookie_file)
    def __del__(self):
        # Shut the browser down once crawling is finished
        # (quit() also terminates the driver process, unlike close())
        self.browser.quit()
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        self.browser.get(request.url)
        # time.sleep(5)
        if request.url.startswith('https://jobs.51job.com/beijing/p'):
            wait_obj = WebDriverWait(self.browser, 10)
            wait_obj.until(expected_conditions.presence_of_element_located((By.CSS_SELECTOR, '#searchForm')))
            print('Anti-bot measures may kick in during the crawl; a manual slider CAPTCHA may be required!')
        # page_source returns the source of the dynamically rendered page
        return HtmlResponse(url=request.url, body=self.browser.page_source,
                            encoding='utf-8', request=request)
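For the middleware to take effect it must also be registered in settings.py. A sketch, assuming the class name SeleniumDownloaderMiddleware used above (543 is the default priority in Scrapy's project template):

DOWNLOADER_MIDDLEWARES = {
   "spider51job.middlewares.SeleniumDownloaderMiddleware": 543,
}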
🫓Summary
With that, we have gotten a solid first grasp of "a brand-new technique" 🍭 ~~
Congratulations, your skills have leveled up once again!!!
Thank you for reading 😆
More updates are on the way 💓, so stay tuned 📌~
💫If you spot any mistakes ❌, corrections are always welcome 💫
✨If you found this rewarding, a like 👍 would be much appreciated~✨
【传知科技 – Learn more new knowledge】