环境:python3.6
需要安装:pip install pdfkit
电脑需要安装 wkhtmltopdf 插件,下载地址:https://wkhtmltopdf.org/downloads.html
其他依赖:requests、BeautifulSoup4
脚本:
# coding=utf-8
"""Scrape every wiki book from maiziedu.com and render each one as a PDF.

Setup: pip install pdfkit requests beautifulsoup4
The wkhtmltopdf binary must also be installed:
https://wkhtmltopdf.org/downloads.html
"""
import os

import pdfkit
import requests
from bs4 import BeautifulSoup

# Site root, prepended to every relative href/src found on the pages.
BASE_URL = "http://www.maiziedu.com"

# Minimal HTML shell each chapter's content is wrapped in before rendering.
# Fix: the original template wrote "<DOCTYPE html>" (missing "!"), which is
# not a valid doctype declaration.
html = """
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>
"""

# wkhtmltopdf rendering options: page geometry, encoding, PDF outline depth,
# plus placeholder custom headers/cookies forwarded to the renderer.
options = {
    'page-size': 'Letter',
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
    'encoding': "UTF-8",
    'custom-header': [
        ('Accept-Encoding', 'gzip')
    ],
    'cookie': [
        ('cookie-name1', 'cookie-value1'),
        ('cookie-name2', 'cookie-value2'),
    ],
    'outline-depth': 10,
}

# Windows install path of the wkhtmltopdf executable (typo in the original
# variable name "wkthmltopdf" fixed; it is only referenced on the next line).
path_wkhtmltopdf = r'C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=path_wkhtmltopdf)


def spair_kecheng_list(URL):
    """Fetch the wiki index page and return all books found on it.

    :param URL: URL of the wiki index page.
    :return: list of ``(book_title, book_url)`` tuples.
    """
    req = requests.get(URL)
    shop = BeautifulSoup(req.text, 'html.parser')
    a = []
    for i in shop.select(".rowList"):
        for j in i.select("li a"):
            # Strip the trailing "学习" ("learn ...") suffix from link titles
            # so the bare book name is used for directories/filenames.
            a.append((str(j["title"]).replace("学习", ""), BASE_URL + j["href"]))
    return a


def spair_kecheng_wenzhang_list(url, book_name):
    """Download every chapter of one book and render them into a single PDF.

    Chapters are saved under a directory named after the book as 1.html,
    2.html, ... and then fed to wkhtmltopdf in order.

    :param url: URL of the book's landing page.
    :param book_name: book title; used as the output directory and PDF name.
    """
    # exist_ok=True replaces the original try/except-pass around makedirs:
    # an already-existing directory is fine, any other OSError should surface.
    os.makedirs(book_name, exist_ok=True)
    req = requests.get(url)
    shop = BeautifulSoup(req.text, 'html.parser')
    a = []
    for index, i in enumerate(shop.select(".wikiPageNavMenu .navBar li a")):
        print(i.text, BASE_URL + i['href'])
        zhangjie_req = requests.get(BASE_URL + i['href'])
        zhangjie_shop = BeautifulSoup(zhangjie_req.text, 'html.parser')
        cont = zhangjie_shop.select(".cont")
        if not cont:
            # Skip chapter pages lacking the expected content container
            # instead of crashing with an IndexError mid-book.
            continue
        # Drop the site's attribution line and rewrite relative upload paths
        # to absolute URLs so images resolve when wkhtmltopdf renders offline.
        wenzhang_html = html.format(
            content=str(cont[0])
            .replace("【本文由麦子学院独家原创,转载请注明出处并保留原文链接】", "")
            .replace("/uploads/", BASE_URL + "/uploads/")
        )
        chapter_path = book_name + os.sep + str(index + 1) + ".html"
        # with-statement replaces the original open/write/close triple and
        # guarantees the file is closed even if write() raises.
        with open(chapter_path, 'w', encoding='utf-8') as f:
            f.write(wenzhang_html)
        a.append(chapter_path)
    print("正在pdf写书中...")
    pdfkit.from_file(a, book_name + ".pdf", configuration=config, options=options)


if __name__ == "__main__":
    # Crawl the wiki index, then build one PDF per book.
    book_list = spair_kecheng_list(BASE_URL + "/wiki/")
    for book in book_list:
        spair_kecheng_wenzhang_list(book[1], book[0])
不出意外我爬取了700M的PDF文档,都是分开的,每本书生成一个PDF。
我的博客即将搬运同步至腾讯云+社区,邀请大家一同入驻:https://cloud.tencent.com/developer/support-plan
评论区