暂时先将代码贴上来,后续具体再补充

技术概述

  • python语法基础
  • requests:写爬虫时非常好用的 HTTP 请求库
  • xpath一种选取内容的语法规则
  • lxml 一种文档解析的库,支持xpath语法

# coding=utf-8
import requests
import os
from lxml import etree
# 纵横中文网 http://book.zongheng.com/showchapter/1020050.html

class Zhongheng:
    """Download the chapters of one novel from book.zongheng.com into text files.

    Each chapter is written to ``<dirname>/<title>.txt``. A chapter whose file
    already exists is skipped, so an interrupted crawl can simply be re-run.
    """

    # Chapter-index page used when no ``catalog_url`` is passed to __init__.
    CATALOG_URL = 'http://book.zongheng.com/showchapter/1020050.html'
    # Seconds to wait on a request before giving up, so a stalled
    # connection cannot hang the whole crawl.
    TIMEOUT = 10
    # Path separators and characters that are not allowed in Windows file
    # names are replaced with '_' when a title is turned into a file name.
    _UNSAFE_CHARS = str.maketrans('\\/:*?"<>|', '_' * 9)

    def __init__(self, dirname='仙人问道', catalog_url=None):
        """Set up the request headers and the crawl target.

        Args:
            dirname: Directory the chapter files are written to.
            catalog_url: Chapter-index page to crawl; defaults to CATALOG_URL.
        """
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36"
        }
        self.dirname = dirname
        self.catalog_url = catalog_url or self.CATALOG_URL

    def get_html(self, url):
        """Fetch ``url`` and parse the response body as HTML.

        Returns:
            The parsed document, or ``None`` when the request raises a
            ``requests`` error (connection failure, timeout, ...) or the
            status code is not 200.
        """
        try:
            res = requests.get(url, headers=self.headers, timeout=self.TIMEOUT)
        except requests.RequestException:
            return None
        if res.status_code != 200:
            return None
        return etree.HTML(res.text)

    def get_chapter_lists(self):
        """Return the chapters listed on the catalog page.

        Returns:
            A list of ``{'title': ..., 'url': ...}`` dicts, one per chapter
            link; an empty list when the catalog page cannot be fetched.
        """
        from urllib.parse import urljoin

        url = self.catalog_url
        html = self.get_html(url)
        if html is None:
            print('章节目录获取失败:', url)
            return []
        data = []
        for chapter in html.xpath("//ul[contains(@class,'chapter-list')]/li/a"):
            titles = chapter.xpath('./text()')
            hrefs = chapter.xpath('./@href')
            # A link without text or href cannot be saved or fetched.
            if not titles or not hrefs:
                continue
            data.append({
                'title': titles[0],
                # Resolve relative links; absolute URLs pass through unchanged.
                'url': urljoin(url, hrefs[0])
            })
        return data

    def get_content(self, chapter):
        """Download the text of one chapter.

        Args:
            chapter: A dict with ``'url'`` and ``'title'`` keys, as produced
                by ``get_chapter_lists``.

        Returns:
            ``{'text': ..., 'title': ...}``, or ``None`` when the chapter
            page cannot be fetched.
        """
        url = chapter.get('url')
        title = chapter.get('title')
        html = self.get_html(url)
        if html is None:
            return None
        content = html.xpath("//div[@class='content']//text()")
        content = '\n'.join(content).strip()
        return {'text': content, 'title': title}

    def file_exit(self, title):
        """Ensure the output directory exists and report whether the chapter
        file for ``title`` is already present.
        """
        # exist_ok avoids the check-then-create race of isdir() + mkdir().
        os.makedirs(self.dirname, exist_ok=True)
        return os.path.exists(self.get_filename(title))

    def get_filename(self, title):
        """Return the path of the text file that stores chapter ``title``.

        Characters that would break the path (separators, ``?``, ``:`` ...)
        are replaced with ``_`` so every title maps to a file directly inside
        ``self.dirname``.
        """
        safe_title = title.translate(self._UNSAFE_CHARS)
        return os.path.join(self.dirname, safe_title + '.txt')

    def save(self, content):
        """Write one chapter to disk unless its file already exists.

        Args:
            content: A dict with ``'title'`` and ``'text'`` keys.
        """
        title = content.get('title')
        text = content.get('text')
        if self.file_exit(title):
            print(title + '已经保存过了')
            return
        with open(self.get_filename(title), 'w', encoding='utf-8') as f:
            f.write(text)
        print(title, '保存成功')

    def run(self):
        """Crawl the catalog and save every chapter that is not on disk yet."""
        for chapter in self.get_chapter_lists():
            title = chapter.get('title')
            # Check before downloading so already-saved chapters cost no request.
            if self.file_exit(title):
                print(title + '已经保存过了')
                continue
            content = self.get_content(chapter)
            if content is None:
                # One failed chapter should not abort the rest of the crawl.
                print(title, '下载失败,已跳过')
                continue
            self.save(content)

if __name__ == '__main__':
    # Script entry point: build the crawler and download every chapter.
    Zhongheng().run()
Last modification:August 30th, 2020 at 04:05 am
哎呀呀,大家随意,随意就好!