Python –爬虫小说学习-仅限于个人娱乐

文章目录

  • 前言
  • 一、演示
  • 二、文件目录示意
  • 三、使用步骤
  • 1.引入库
  • 2.界面控制程序
  • 3.QT业务控制程序
  • 4.批量修改文件名称
  • 总结

  • 前言

    娱乐项目记载:爬取网络上的小说


    一、演示

    二、文件目录示意

    三、使用步骤

    1.引入库

    代码如下(示例):

    import requests
    from lxml import html #调用lxml模块和requests模块
    from pangchong import Worker
    import webbrowser
    import time,os
    from Ui_dowondstory import Ui_MainWindow
    import sys
    from PyQt5.QtGui import QIcon,QDesktopServices  # 用于添加图标
    from PyQt5.QtWidgets import QMainWindow,QApplication
    from PyQt5.QtCore import QUrl
    

    2.界面控制程序

    main_pc.py:主要显示界面,消息发送,启动QT业务线程。

    代码如下:

    #_*_ coding:utf-8 _*_
    
    '''
    #1.获取书名
    #2.获取链接和目录名
    #3.获取内容
    #4.保存内容'''
    
    import requests
    from lxml import html #调用lxml模块和requests模块
    from pangchong import Worker
    import webbrowser
    import time,os
    from Ui_dowondstory import Ui_MainWindow
    import sys
    from PyQt5.QtGui import QIcon,QDesktopServices  # 用于添加图标
    from PyQt5.QtWidgets import QMainWindow,QApplication
    from PyQt5.QtCore import QUrl
    
    
    class LanFei_show_window(QMainWindow,Ui_MainWindow):
        """Main window of the novel downloader.

        Combines the generated ``Ui_MainWindow`` layout with the behaviour:
        opening the entered URL in a web browser, parsing the book's index page
        and starting a ``Worker`` thread that downloads the chapters. Messages
        sent back by the worker are appended to ``textBrowser``.
        """

        # Seconds to wait for the index page before giving up, so a dead
        # server cannot block the GUI thread indefinitely.
        REQUEST_TIMEOUT = 10

        def __init__(self):
            super().__init__()
            self.setupUi(self)  # build the widgets declared in Ui_MainWindow
            self.setWindowIcon(QIcon("./IMG/icon/icon.jpg"))
            self.setWindowTitle("测试使用")
            # No download is running yet; dowtext() checks this before stopping.
            self.worker = None
            self.initUI()

        def initUI(self):
            """Wire the buttons and the log view, and pre-fill a sample URL."""
            self.pushButton.clicked.connect(self.openurl)
            self.pushButton_2.clicked.connect(self.dowtext)
            self.lineEdit.setText("https://www.xtyxsw.org/read/280637/")
            # Links in the log are handled by click_textbrowser() instead of
            # being navigated to. Connecting once here (rather than on every
            # finished download) avoids stacking duplicate handlers.
            self.textBrowser.setOpenLinks(False)
            self.textBrowser.setOpenExternalLinks(False)
            self.textBrowser.anchorClicked.connect(self.click_textbrowser)

        def click_textbrowser(self):
            """Open the current working directory in the system file manager."""
            self.msg = os.getcwd()
            QDesktopServices.openUrl(QUrl.fromLocalFile(self.msg))

        def _log_error(self,message):
            """Append ``message`` to the log in red, prefixed with a timestamp."""
            self.textBrowser.append("<font color=\"#FF0000\">{}:{}</font> ".format(self.gettime(),message))

        def openurl(self):
            """Open the URL from the line edit in the default web browser."""
            geturl = self.lineEdit.text()
            print(geturl)
            print("打开网址:{}".format(geturl))
            if geturl != "":
                webbrowser.open(geturl)
            else:
                self._log_error("请先输入网址路径!")

        def gettime(self):
            """Return the current local time as ``YYYY-MM-DD HH:MM:SS``."""
            return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())

        def dowtext(self):
            """Slot of the download button: start a download or request a stop.

            The button caption doubles as the state: "下载" starts a new
            download, "停止" asks the running worker to stop.
            """
            caption = self.pushButton_2.text()
            if caption == "下载":
                geturl = self.lineEdit.text().strip()
                if not geturl:
                    self._log_error("请先输入网址路径!")
                    return
                try:
                    self.test(geturl)
                except Exception as exc:
                    # Slot boundary: network and parsing errors are reported in
                    # the log instead of escaping into the Qt event loop.
                    self._log_error("下载启动失败:{}".format(exc))
                    return
                self.pushButton_2.setText("停止")
            elif caption == "停止" and self.worker is not None:
                self.worker.change_ret()

        def test(self,url):
            """Parse the index page at ``url`` and start the download worker.

            Raises whatever ``get_url``, ``get_book_url`` or ``get_dir`` raise
            (for example a requests exception or ``ValueError``).
            """
            # Fetch the index page once and reuse it for title and chapter list.
            page = self.get_url(url)

            book_name = self.get_book_url(url,page)
            print("获取书名:" + book_name)
            self.textBrowser.append("{}:".format(self.gettime())+"获取书名--" + book_name)

            htmls_list,name_list = self.get_dir(url,page)
            self.data = [book_name,name_list,htmls_list]

            # Keep a reference on self so the thread object outlives this call.
            self.worker = Worker(msg=self.data)
            self.worker.finished.connect(self.receive)
            self.worker.start()

        def get_url(self,url):
            """Download ``url`` and return the body decoded as UTF-8.

            Raises a requests exception on timeout, connection failure or an
            HTTP error status.
            """
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.content.decode("utf-8")

        def get_book_url(self,url,page=None):
            """Return the book title found on the index page.

            ``page`` is the already downloaded HTML of ``url``; when omitted the
            page is fetched here. Raises ``ValueError`` when the title node is
            not present in the page.
            """
            if page is None:
                page = self.get_url(url)
            selector = html.fromstring(page)
            shumin = selector.xpath('/html/body/div[3]/div[2]/div/span/text()')
            if not shumin:
                raise ValueError("未能解析书名,请检查网址:{}".format(url))
            return str(shumin[0]).strip()

        def get_dir(self,url,page=None):
            """Return ``(chapter_links, chapter_names)`` parsed from the index page.

            ``page`` is the already downloaded HTML of ``url``; when omitted the
            page is fetched here. Both results are lists of plain strings.
            """
            if page is None:
                page = self.get_url(url)
            selector = html.fromstring(page)

            htmls_list = [str(link) for link in selector.xpath('//div[@class = "link_14"]/dl/dd/a/@href')]
            names_list = [str(name) for name in selector.xpath('//div[@class = "link_14"]/dl/dd/a/text()')]

            print("每章节链接:" + str(htmls_list) )
            print("每章节目录:" + str(names_list))
            print(len(names_list))
            return htmls_list,names_list

        def receive(self,text=None):
            """Show a ``[code, message]`` pair sent by the worker in the log.

            Code 1 is shown in blue, code 2 in red. Code 3 is shown as a link
            (clicking it opens the working directory) and switches the button
            back to "下载". Empty or too short messages are ignored.
            """
            if not text or len(text) < 2:
                return
            code, message = text[0], text[1]
            stamp = self.gettime()
            if code == 1:
                self.textBrowser.append("<font color=\"#0000FF\">{}:{}</font> ".format(stamp,message))
            elif code == 2:
                self.textBrowser.append("<font color=\"#FF0000\">{}:{}</font> ".format(stamp,message))
            elif code == 3:
                # Give the link a real target; the original left a literal
                # "%s" placeholder in the href attribute.
                folder_url = QUrl.fromLocalFile(os.getcwd()).toString()
                self.textBrowser.append("<a href=\"{}\">{}:{}</a>".format(folder_url,stamp,message))
                self.pushButton_2.setText("下载")
    
    if __name__ == "__main__":
        # Script entry point: create the Qt application, show the main window
        # and pass the event loop's return code on as the process exit status.
        application = QApplication(sys.argv)
        main_window = LanFei_show_window()
        main_window.show()
        exit_status = application.exec_()
        sys.exit(exit_status)
    
    

    Ui_dowondstory.py:pyqt程序

    代码如下:

    # -*- coding: utf-8 -*-
    
    # Form implementation generated from reading ui file 'd:\pythonitem\爬虫小说\dowondstory.ui'
    #
    # Created by: PyQt5 UI code generator 5.15.11
    #
    # WARNING: Any manual changes made to this file will be lost when pyuic5 is
    # run again.  Do not edit this file unless you know what you are doing.
    
    
    from PyQt5 import QtCore, QtGui, QtWidgets
    
    
    class Ui_MainWindow(object):
        """Widget layout produced by the PyQt5 UI code generator (see file header).

        ``setupUi`` creates the widgets as attributes of this object:
        ``label``, ``lineEdit``, ``pushButton``, ``pushButton_2`` and
        ``textBrowser``, arranged in a grid on ``centralwidget``.
        """

        def setupUi(self, MainWindow):
            """Create all widgets, place them in the grid and attach them to ``MainWindow``."""
            MainWindow.setObjectName("MainWindow")
            MainWindow.resize(579, 368)
            self.centralwidget = QtWidgets.QWidget(MainWindow)
            self.centralwidget.setObjectName("centralwidget")
            self.gridLayout = QtWidgets.QGridLayout(self.centralwidget)
            self.gridLayout.setObjectName("gridLayout")
            # Row 0: label, line edit and the two buttons in columns 0-3.
            self.label = QtWidgets.QLabel(self.centralwidget)
            self.label.setObjectName("label")
            self.gridLayout.addWidget(self.label, 0, 0, 1, 1)
            self.lineEdit = QtWidgets.QLineEdit(self.centralwidget)
            self.lineEdit.setObjectName("lineEdit")
            self.gridLayout.addWidget(self.lineEdit, 0, 1, 1, 1)
            self.pushButton = QtWidgets.QPushButton(self.centralwidget)
            self.pushButton.setObjectName("pushButton")
            self.gridLayout.addWidget(self.pushButton, 0, 2, 1, 1)
            self.pushButton_2 = QtWidgets.QPushButton(self.centralwidget)
            self.pushButton_2.setObjectName("pushButton_2")
            self.gridLayout.addWidget(self.pushButton_2, 0, 3, 1, 1)
            # Row 1: the text browser spans all four columns.
            self.textBrowser = QtWidgets.QTextBrowser(self.centralwidget)
            self.textBrowser.setObjectName("textBrowser")
            self.gridLayout.addWidget(self.textBrowser, 1, 0, 1, 4)
            MainWindow.setCentralWidget(self.centralwidget)
            # Menu bar and status bar are created but no entries are added here.
            self.menubar = QtWidgets.QMenuBar(MainWindow)
            self.menubar.setGeometry(QtCore.QRect(0, 0, 579, 23))
            self.menubar.setObjectName("menubar")
            MainWindow.setMenuBar(self.menubar)
            self.statusbar = QtWidgets.QStatusBar(MainWindow)
            self.statusbar.setObjectName("statusbar")
            MainWindow.setStatusBar(self.statusbar)
    
            # Apply the visible texts, then let Qt connect slots by object name.
            self.retranslateUi(MainWindow)
            QtCore.QMetaObject.connectSlotsByName(MainWindow)
    
        def retranslateUi(self, MainWindow):
            """Set the window title, the label text and both button captions."""
            _translate = QtCore.QCoreApplication.translate
            MainWindow.setWindowTitle(_translate("MainWindow", "MainWindow"))
            self.label.setText(_translate("MainWindow", "下载地址:"))
            self.pushButton.setText(_translate("MainWindow", "打开"))
            # pushButton_2 starts out with the caption "下载".
            self.pushButton_2.setText(_translate("MainWindow", "下载"))
    

    3.QT业务控制程序

    pangchong.py:爬取章节小说的业务执行程序

    代码如下:

    import requests
    import os
    from lxml import html #调用lxml模块和requests模块
    import time
    import time
    from PyQt5.QtCore import QThread,pyqtSignal
    import threading
    
    class Worker(QThread):
        """Background thread that downloads the chapters of one book to text files.

        ``msg`` is ``[book_name, name_list, htmls_list]``: the book title, the
        chapter titles and the chapter links (paths appended to ``BASE_URL``).
        Progress is reported through the ``finished`` signal as
        ``[code, text]``:

        * 1 - a chapter was written,
        * 2 - the download was stopped or a chapter failed,
        * 3 - the thread is done; always emitted last, including after a stop
          or a failure, so a listener can rely on it to reset its state.
        """

        # NOTE(review): this attribute shadows QThread's own ``finished``
        # signal. The name is kept because callers connect to it.
        finished = pyqtSignal(list)

        # Site the relative chapter links belong to.
        BASE_URL = "https://www.xtyxsw.org"
        # Seconds to wait for one page, so a dead connection cannot hang the
        # thread (and with it the stop request) forever.
        REQUEST_TIMEOUT = 10
        # Characters that are not allowed in Windows file names.
        INVALID_NAME_CHARS = '\\/:*?"<>|'

        def __init__(self,msg=None):
            super().__init__()
            self.msg = msg
            # String flag: "True" while running, "break" once a stop is requested.
            self.ret = "True"

        def run(self):
            """Download all chapters in order until done or a stop is requested."""
            try:
                if not self.msg:
                    self.finished.emit([2,"没有可下载的章节!"])
                    return
                book_name, name_list, htmls_list = self.msg[0], self.msg[1], self.msg[2]
                # Only indices present in both lists can be downloaded.
                total = min(len(name_list), len(htmls_list))
                for number in range(total):
                    if self.ret == "break":
                        self.finished.emit([2,"已停止下载!"])
                        break
                    try:
                        # Called directly: the original passed the *result* of
                        # save() as a thread target, so the extra thread never
                        # did any work.
                        self.save(book_name,name_list,htmls_list,number)
                    except Exception as exc:
                        # Thread boundary: report the failed chapter and go on
                        # with the next one instead of dying silently.
                        self.finished.emit([2,"第{}章下载失败:{}".format(number + 1,exc)])
            finally:
                self.finished.emit([3,"完成下载!"])

        def change_ret(self):
            """Request a stop; it takes effect before the next chapter starts."""
            self.ret = "break"

        def get_url(self,url):
            """Download ``url`` and return the body decoded as UTF-8.

            Raises a requests exception on timeout, connection failure or an
            HTTP error status.
            """
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.content.decode("utf-8")

        def get_neirong(self,htmls_list,number):
            """Return the text lines of chapter ``number``.

            When the chapter page contains a link whose text is exactly
            "下一页", that one following page is fetched as well and its lines
            are appended.
            """
            url = self.BASE_URL + htmls_list[number]
            print("网址:" + url)
            selector = html.fromstring(self.get_url(url))
            liebiao = [str(line) for line in selector.xpath('//div[@id="content"]/p/text()')]

            # Take the target from the "下一页" anchor itself rather than from a
            # fixed position in the list of all links on the page.
            next_links = selector.xpath('//a[text()="下一页"]/@href')
            if next_links:
                next_page = html.fromstring(self.get_url(self.BASE_URL + str(next_links[0])))
                liebiao.extend(str(line) for line in next_page.xpath('//div[@id="content"]/p/text()'))
            return liebiao

        def _safe_name(self,name):
            """Return ``name`` without characters that are invalid in file names."""
            return "".join(ch for ch in str(name) if ch not in self.INVALID_NAME_CHARS).strip()

        def save(self,book_name,name_list,htmls_list,number):
            """Download chapter ``number`` and write it to ``<cwd>/<book>/<chapter>.txt``.

            The book directory is created when missing. A negative ``number``
            only creates the directory. Emits ``[1, text]`` after the file is
            written and then pauses briefly before returning.
            """
            path = os.path.join(os.getcwd(), self._safe_name(book_name))
            os.makedirs(path, exist_ok=True)
            if number < 0:
                return

            liebiao = self.get_neirong(htmls_list,number)

            mulu = self._safe_name(name_list[int(number)])
            paths = os.path.join(path, mulu + ".txt")

            with open(paths,"w",encoding="utf-8") as file:
                file.writelines(wenzhi + "\n" for wenzhi in liebiao)

            h = "完成第" + str(int(number)+1) + "章写入!"
            print(h)
            self.finished.emit([1,h])
            # Short pause between chapters to go easy on the server.
            time.sleep(0.5)

        def finisheds(self,i,h=None):
            """Emit ``[i, h]`` through the ``finished`` signal."""
            self.finished.emit([i,h])
    
    

    4.批量修改文件名称

    xiugainame.py:将文件名中的中文数字全部转换为阿拉伯数字

    修改前和修改后的显示图片

    xiugainame.py:修改文件名称程序

    代码如下:

    import os
    import re


    # Rename files so that Chinese numerals in their names become Arabic digits.

    path = "./末日重生:开局囤积SSS级卡牌小说"

    # Value of every digit character.
    DIGITS = {"零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
              "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
    # Multiplier of every unit character.
    UNITS = {"十": 10, "百": 100, "千": 1000}
    # A maximal run of numeral characters is converted as one number.
    NUMERAL_RUN = re.compile("[零一二三四五六七八九十百千]+")


    def chinese_to_arabic(numeral):
        """Return the Arabic-digit string for a run of Chinese numeral characters.

        A run without unit characters is mapped digit by digit, so leading
        zeros survive ("二零二四" -> "2024"). A run with units is evaluated as
        a number ("一百零一" -> "101", "十" -> "10").
        """
        if not any(ch in UNITS for ch in numeral):
            return "".join(str(DIGITS[ch]) for ch in numeral)

        total = 0
        pending = 0  # digit waiting for its unit
        for ch in numeral:
            if ch in DIGITS:
                pending = DIGITS[ch]
            else:
                # A unit without a digit in front counts once: "十" is 10.
                total += (pending or 1) * UNITS[ch]
                pending = 0
        return str(total + pending)


    def convert_name(filename):
        """Return ``filename`` with every Chinese numeral run converted to digits."""
        return NUMERAL_RUN.sub(lambda match: chinese_to_arabic(match.group()), filename)


    def rename_files(directory):
        """Rename every entry of ``directory`` via ``convert_name``.

        Entries whose name does not change are left alone, and an existing
        target is never overwritten. Returns the number of renamed entries.
        """
        try:
            files = os.listdir(directory)
        except FileNotFoundError:
            print("目录不存在:" + directory)
            return 0
        print(files)

        renamed = 0
        for shuju in files:
            combined_string = convert_name(shuju)
            print(combined_string)
            if combined_string == shuju:
                continue

            old_path = os.path.join(directory, shuju)
            new_path = os.path.join(directory, combined_string)
            # os.rename may silently replace an existing file on some systems.
            if os.path.exists(new_path):
                print("目标文件已存在,跳过:" + combined_string)
                continue

            try:
                os.rename(old_path, new_path)
            except FileNotFoundError:
                print("源文件未找到")
            except PermissionError:
                print("权限不足,无法修改文件名")
            else:
                renamed += 1
        return renamed


    if __name__ == "__main__":
        rename_files(path)
    

    总结

    娱乐使用,仅供参考,不同的网站可能格式不同,大家自行钻研,嘿嘿。

    作者:山中坐

    物联沃分享整理
    物联沃-IOTWORD物联网 » Python –爬虫小说学习-仅限于个人娱乐

    发表回复