Reference: https://blog.csdn.net/qq_44907926/article/details/119531324
Steps to create the project:
1. Target site: www.itcast.cn
2. Install the virtualenv tool
pip install virtualenv
3. Create a virtual environment
virtualenv --always-copy --system-site-packages venv
4. Activate the virtual environment (Windows)
venv\scripts\activate
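On Linux/macOS, the equivalent activation command is: source venv/bin/activate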
5. Install Twisted (an event-driven networking engine that Scrapy is built on; installing Scrapy in the next step would also pull it in automatically)
pip install twisted
6. Install the Scrapy crawler framework
pip install scrapy
7. Create the project (the resulting layout is shown after the commands)
scrapy startproject itcastScrapy
cd itcastScrapy
# itcastSpider is the spider name; itcast.cn is the site to crawl
scrapy genspider itcastSpider itcast.cn
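For orientation, these two commands generate the standard Scrapy project skeleton:
itcastScrapy/
├── scrapy.cfg                # deploy configuration
└── itcastScrapy/
    ├── __init__.py
    ├── items.py              # item models (9.1)
    ├── pipelines.py          # item pipelines (9.2)
    ├── middlewares.py        # middlewares (9.3)
    ├── settings.py           # project settings (9.4)
    └── spiders/
        ├── __init__.py
        └── itcastSpider.py   # generated by genspider (9.5)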

8. Create main.py to run Scrapy (so the spider can be started without typing the command each time)
from scrapy import cmdline
cmdline.execute("scrapy crawl itcastSpider".split())
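An equivalent way to run the spider without the cmdline helper is Scrapy's CrawlerProcess API; a minimal sketch, assuming main.py sits in the project root so settings.py can be located:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# load the project's settings.py and run the spider in this process
process = CrawlerProcess(get_project_settings())
process.crawl('itcastSpider')  # the spider's `name` attribute
process.start()                # blocks until the crawl finishes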

9. Code for each file
9.1 items.py: the content to scrape
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
# model the content to be scraped
class ItcastscrapyItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
class ItcastItem(scrapy.Item):
    # teacher's name
    name = scrapy.Field()
    # teacher's job title
    title = scrapy.Field()
    # teacher's profile text
    info = scrapy.Field()
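Once the fields are declared, an item behaves like a dict; a tiny usage sketch with made-up values:

item = ItcastItem(name='Zhang San', title='Senior Lecturer')  # hypothetical values
item['info'] = '10 years of Python teaching experience'       # hypothetical value
print(dict(item))  # a scrapy.Item converts cleanly to a plain dict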
9.2 pipelines.py: pipeline that saves the data
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json
# pipeline: saves the scraped data
class ItcastscrapyPipeline:
    def process_item(self, item, spider):
        return item

class UbuntuPipeline(object):
    def open_spider(self, spider):
        # runs once when the spider starts: open the output file here
        self.file = open('itcast.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # fixed-name hook: runs once for every item the spider yields,
        # and must return the item so the engine can hand it on to any
        # later pipelines
        # cast the item to a dict (scrapy.Item objects support this)
        item = dict(item)
        # 1. serialize the dict; ensure_ascii=False keeps non-ASCII text
        #    readable instead of \uXXXX escapes (the default is True)
        json_data = json.dumps(item, ensure_ascii=False, indent=2) + ',\n'
        # 2. write the data to the file
        self.file.write(json_data)
        return item

    def close_spider(self, spider):
        # runs once when the spider finishes: close the file here
        self.file.close()
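Note that writing JSON by hand like this produces a file that is not strictly valid JSON (trailing commas, no enclosing array). If a plain JSON dump is all that is needed, Scrapy's built-in feed exports can replace this pipeline entirely:

scrapy crawl itcastSpider -O itcast.json

(-O overwrites the output file on recent Scrapy versions; older versions only support -o, which appends.)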
9.3 middlewares.py: custom middlewares file (left as generated here)
9.4 settings.py: settings file (only the changed entries are shown)
# do not obey robots.txt rules
ROBOTSTXT_OBEY = False
# set a browser User-Agent in the default request headers
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.162 Safari/537.36',
}
# 300 is the pipeline priority: the lower the value, the earlier it runs
ITEM_PIPELINES = {
   # "itcastScrapy.pipelines.ItcastscrapyPipeline": 300,
    "itcastScrapy.pipelines.UbuntuPipeline": 300,
}
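Two more settings are worth knowing about for crawl politeness; a hedged sketch, with illustrative values that are not part of the original post:

DOWNLOAD_DELAY = 1        # pause 1 second between requests to the same site
CONCURRENT_REQUESTS = 8   # cap parallel requests (Scrapy's default is 16)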
9.5 spiders/itcastSpider.py: the spider
import scrapy
from ..items import ItcastItem
from bs4 import BeautifulSoup
class ItcastspiderSpider(scrapy.Spider):
    name = "itcastSpider"
    allowed_domains = ["itcast.cn"]
    start_urls = ["https://itcast.cn"]
    def parse(self, response):
        # response.body.decode() is the raw page source
        soup = BeautifulSoup(response.body.decode(), 'lxml')
        li_list = soup.select('div.head_nav>ul>li')
        for li in li_list:
            # find the nav entry 教研团队 ("teaching & research team")
            if li.text.strip() == '教研团队':
                for a in li.select('a'):
                    href = a['href']
                    yield scrapy.Request(url=href, callback=self.parse_teacher)
                break

    def parse_teacher(self, response):
        # print(response.body.decode())  # uncomment to inspect the page source
        document = BeautifulSoup(response.body.decode(), 'lxml')
        li_list = document.select('div.tea_con div.tea_txt_cur ul li')
        # walk the teacher nodes and yield one item per teacher
        for li in li_list:
            item = ItcastItem()
            item['name'] = li.select('div.li_txt h3')[0].text
            item['title'] = li.select('div.li_txt h4')[0].text
            item['info'] = li.select('div.li_txt p')[0].text
            yield item
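BeautifulSoup works, but Scrapy ships its own CSS selectors, so parse_teacher could also be written without the extra dependency; a sketch reusing the same selectors (not re-tested against the live site):

    def parse_teacher(self, response):
        for li in response.css('div.tea_con div.tea_txt_cur ul li'):
            item = ItcastItem()
            # ::text extracts node text; get() returns the first match or None
            item['name'] = li.css('div.li_txt h3::text').get()
            item['title'] = li.css('div.li_txt h4::text').get()
            item['info'] = li.css('div.li_txt p::text').get()
            yield item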
Execution result: after running main.py, the scraped teacher records are written to itcast.json (result screenshot omitted).