3-1课堂笔记
# Lecture 3-1 notes: data-collection basics — scraping Douban Books Top 250.
import os
import json

import requests
from bs4 import BeautifulSoup


def getHTML(n):
    """Fetch one result page (25 books per page).

    n is the 0-based page index; Douban pages via a `start` offset of n*25.
    Returns the raw HTML text. Raises requests.HTTPError on a bad status.
    """
    url = "https://book.douban.com/top250"
    # A browser-like User-Agent is needed; Douban rejects default clients.
    header = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/137.0.0.0 Safari/537.36"
    }
    r = requests.get(url, headers=header, params={"start": n * 25})
    r.raise_for_status()  # fail loudly instead of parsing an error page
    return r.text


def getlistData(html):
    """Parse one page of HTML into a list of book dicts.

    Each dict has keys 书名 (title) and 基本信息 (basic info).
    """
    # Parse with the lxml parser (must be installed separately from bs4).
    soup = BeautifulSoup(html, "lxml")
    booklist = []
    for book in soup.select("tr"):
        tds = book.select("td")
        if len(tds) < 2:
            # Skip rows without a book cell; the original indexed tds[1]
            # unconditionally and would crash on such rows.
            continue
        bookdic = {
            # The <a> text holds the title, with an optional subtitle on a
            # second line — keep only the first line.
            "书名": tds[1].div.a.text.strip().split("\n")[0],
            "基本信息": tds[1].p.text,
        }
        booklist.append(bookdic)
    return booklist


def savejson(data, path, filename):
    """Write *data* as pretty-printed UTF-8 JSON to path/filename."""
    jData = json.dumps(data, indent=2, ensure_ascii=False)
    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...)` guard.
    os.makedirs(path, exist_ok=True)
    # os.path.join works whether or not *path* ends with a separator.
    with open(os.path.join(path, filename), "w", encoding="utf-8") as f:
        f.write(jData)


if __name__ == "__main__":
    # Top 250 = 10 pages x 25 books; allbooks is a list of 10 per-page lists
    # (the original appended each page's list, so the nesting is preserved).
    allbooks = []
    for i in range(10):
        html = getHTML(i)
        allbooks.append(getlistData(html))
    savejson(allbooks, "data/", "douban2520.json")
本文来自互联网用户投稿,该文观点仅代表作者本人,不代表本站立场。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如若转载,请注明出处:http://www.coloradmin.cn/o/2425802.html
如若内容造成侵权/违法违规/事实不符,请联系多彩编程网进行投诉反馈,一经查实,立即删除!