Python 爬虫入门
Python 爬虫开发
Python 爬虫工具 BeautifulSoup

文章目录

  • 1. 常用库安装
  • 2. 基础爬虫开发
  • 2.1. 使用 requests 获取网页内容
  • 2.2. 使用 BeautifulSoup 解析 HTML
  • 2.2.1. 实战
  • 2.3. 处理登录与会话
  • 3. 进阶爬虫开发
  • 3.1. 处理动态加载内容(Selenium)
  • 3.2. 使用Scrapy框架
  • 3.3. 分布式爬虫(Scrapy-Redis)
  • 4. 爬虫优化与反反爬策略
  • 4.1. 常见反爬机制及应对
  • 4.2. 代理IP使用示例
  • 4.3. 随机延迟与请求头
  • BeautifulSoup 官方文档

    https://beautifulsoup.readthedocs.io/zh-cn/v4.4.0/

    https://cloud.tencent.com/developer/article/1193258

    https://blog.csdn.net/zcs2312852665/article/details/144804553

    参考:
    https://blog.51cto.com/haiyongblog/13806452

    1. 常用库安装

    # webdriver-manager, fake-useragent and scrapy-redis are imported by the later examples
    pip install requests beautifulsoup4 scrapy selenium pandas webdriver-manager fake-useragent scrapy-redis
    

    2. 基础爬虫开发

    2.1. 使用 requests 获取网页内容

    import requests
    
    url = 'https://top.baidu.com/board?tab=realtime'
    # Custom request headers carrying a browser-style User-Agent string.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    
    response = requests.get(url, headers=headers)
    print(response.status_code)     # 200 means success
    print(response.text[:500])      # print the first 500 characters
    

    2.2. 使用 BeautifulSoup 解析 HTML

    from bs4 import BeautifulSoup
    
    # Sample HTML document to parse (the whitespace inside the literal is part of the string).
    html_doc = """
                <html><head><title>测试页面</title></head>
                <body>
                <p class="title"><b>示例网站</b></p>
                <p class="story">这是一个示例页面
                <a href="http://example.com/1" class="link" id="link1">链接1</a>
                <a href="http://example.com/2" class="link" id="link2">链接2</a>
                </p>
                """
    
    # Parse the markup using the 'html.parser' backend.
    soup = BeautifulSoup(html_doc, 'html.parser')
    
    # Get the title
    print(soup.title.string)
    
    # Get all links: print each anchor's href together with its text
    for link in soup.find_all('a'):
        print(link.get('href'), link.string)
    
    # Find an element by CSS class
    print(soup.find('p', class_='title').text)
    

    2.2.1. 实战

    import requests
    from bs4 import BeautifulSoup, element
    
    def url_unsplit(url: str, path: str, params: str="", query: str="", fragment: str=""):
        """Build a URL that reuses the scheme and host of *url* with the given parts.

        Args:
            url: Absolute URL whose scheme and network location are reused.
            path: Path of the new URL. It is inserted verbatim, so a value such
                as "/board?tab=realtime" keeps its embedded query string.
            params: Optional path parameters, appended to the path after ";".
            query: Optional query string (without the leading "?").
            fragment: Optional fragment (without the leading "#").

        Returns:
            The assembled URL as a string. The URL is also printed, followed
            by "END".
        """
        from urllib.parse import urlsplit, urlunparse

        base = urlsplit(url)

        # urlunparse (unlike urlunsplit) has a slot for path parameters, so the
        # `params` argument is honoured instead of being silently dropped.
        # With an empty `params` the result equals the urlunsplit output.
        new_url = urlunparse((base.scheme, base.netloc, path, params, query, fragment))
        print(new_url, "END")  # e.g. https://www.example.com/path/to/resource?x=100#anchor END
        return new_url
    
    # 
    def fetch_header_with_headers_and_proxies(url, headers):
        """Download *url* and return the href of the first ``a.hot-title`` link.

        Args:
            url: Page to download.
            headers: HTTP request headers passed to ``requests.get``.

        Returns:
            The ``href`` attribute of the first matching anchor, or "" when the
            page contains no such anchor or the anchor has no ``href``.

        Raises:
            requests.exceptions.RequestException: propagated unchanged from
                ``requests.get`` (including a timeout).
        """
        # To route the request through a proxy, pass a mapping such as
        # proxies={'http': 'http://10.10.1.10:3128', 'https': 'https://10.10.1.10:1080'}
        # to requests.get as well.
        # The timeout keeps an unresponsive server from blocking the script forever.
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        tags = soup.select("a.hot-title")

        # select() always returns a ResultSet, so the failure case is an empty
        # result rather than a wrong type; indexing [0] on it would raise IndexError.
        if not tags:
            print("Error", type(tags), len(tags))
            return ""

        attrs = tags[0].attrs
        print(attrs)
        return attrs.get('href', "")
    
    # 
    def fetch_hot_title_with_headers_and_proxies(url, headers):
        """Download *url* and return the href of the first ``a.more_Uf8LD`` link.

        Args:
            url: Page to download.
            headers: HTTP request headers passed to ``requests.get``.

        Returns:
            The ``href`` attribute of the first matching anchor, or "" when the
            page contains no such anchor or the anchor has no ``href``.

        Raises:
            requests.exceptions.RequestException: propagated unchanged from
                ``requests.get`` (including a timeout).
        """
        # The timeout keeps an unresponsive server from blocking the script forever.
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        tags = soup.select("a.more_Uf8LD")

        # select() always returns a ResultSet, so the failure case is an empty
        # result rather than a wrong type; indexing [0] on it would raise IndexError.
        if not tags:
            print("Error", type(tags), len(tags))
            return ""

        attrs = tags[0].attrs
        print(attrs)
        return attrs.get('href', "")
    
    # 
    def fetch_realtime_with_headers_and_proxies(url, headers):
        """Download the board page at *url* and print each entry's title and link.

        For every ``div.category-wrap_iQLoo.horizontal_1eKyQ`` element, the text
        of its first ``div.c-single-text-ellipsis`` and the href of the anchor
        inside its first ``div.hot-desc_1m_jR.large_nSuFU`` are printed.

        Args:
            url: Page to download.
            headers: HTTP request headers passed to ``requests.get``.

        Returns:
            Always ""; the results are only printed.

        Raises:
            requests.exceptions.RequestException: propagated unchanged from
                ``requests.get`` (including a timeout).
        """
        # The timeout keeps an unresponsive server from blocking the script forever.
        response = requests.get(url, headers=headers, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        tags = soup.select("div.category-wrap_iQLoo.horizontal_1eKyQ")

        # select() always returns a ResultSet; an empty one means the page
        # layout did not match the selector.
        if not tags:
            print("Error", type(tags), len(tags))
            return ""

        for tag in tags:
            title_tag = tag.select("div.c-single-text-ellipsis")
            content_tag = tag.select("div.hot-desc_1m_jR.large_nSuFU")

            # Skip entries that lack a title, a description block or a link,
            # instead of failing with IndexError / AttributeError mid-loop.
            if not title_tag or not content_tag or content_tag[0].a is None:
                continue

            print(type(title_tag), title_tag[0].string)
            print(type(content_tag), content_tag[0].a.attrs.get('href', ""))
        return ""
    
    
    # Send the HTTP requests
    url = "http://www.baidu.com/"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
    
    # 1. Visit the home page and get the address of the hot-search board
    hot_title = fetch_header_with_headers_and_proxies(url, headers)
    print(f"1. {hot_title}")
    
    # 2. Visit the hot-search board and get the path of the realtime list
    realtime_path = fetch_hot_title_with_headers_and_proxies(hot_title, headers)
    print(f"2. {realtime_path}")
    
    # 3. Assemble the realtime URL, then fetch the hot-search titles and link addresses
    realtime_new = url_unsplit(hot_title, realtime_path)
    print(f"3. {realtime_new}")
    # The fetch function prints its results and returns "", which overwrites realtime_path.
    realtime_path = fetch_realtime_with_headers_and_proxies(realtime_new, headers)
    

    运行结果:

    > python.exe .\main3.py
    {'class': ['hot-title'], 'href': 'https://top.baidu.com/board?platform=pc&sa=pcindex_entry', 'target': '_blank'}
    1. https://top.baidu.com/board?platform=pc&sa=pcindex_entry
    {'class': ['more_Uf8LD'], 'href': '/board?tab=realtime', 'data-click': '{"pos":"i_more_realtime"}'}
    2. /board?tab=realtime
    https://top.baidu.com/board?tab=realtime END
    3. https://top.baidu.com/board?tab=realtime
    <class 'bs4.element.ResultSet'>   老百姓盼的 就是我们要干的 
    <class 'bs4.element.ResultSet'> https://www.baidu.com/s?wd=%E8%80%81%E7%99%BE%E5%A7%93%E7%9B%BC%E7%9A%84+%E5%B0%B1%E6%98%AF%E6%88%91%E4%BB%AC%E8%A6%81%E5%B9%B2%E7%9A%84&sa=fyb_news&rsv_dl=fyb_news
    <class 'bs4.element.ResultSet'>   男友还原女子在三亚被蛇咬身亡过程 
    <class 'bs4.element.ResultSet'> https://www.baidu.com/s?wd=%E7%94%B7%E5%8F%8B%E8%BF%98%E5%8E%9F%E5%A5%B3%E5%AD%90%E5%9C%A8%E4%B8%89%E4%BA%9A%E8%A2%AB%E8%9B%87%E5%92%AC%E8%BA%AB%E4%BA%A1%E8%BF%87%E7%A8%8B&sa=fyb_news&rsv_dl=fyb_news
    ......
    <class 'bs4.element.ResultSet'>   外国妈妈拿到“五星卡”向宝宝炫耀 
    <class 'bs4.element.ResultSet'> https://www.baidu.com/s?wd=%E5%A4%96%E5%9B%BD%E5%A6%88%E5%A6%88%E6%8B%BF%E5%88%B0%E2%80%9C%E4%BA%94%E6%98%9F%E5%8D%A1%E2%80%9D%E5%90%91%E5%AE%9D%E5%AE%9D%E7%82%AB%E8%80%80&sa=fyb_news&rsv_dl=fyb_news
    

    2.3. 处理登录与会话

    import requests
    
    login_url = 'https://example.com/login'
    target_url = 'https://example.com/dashboard'

    # Form fields sent with the login request
    login_data = {
        'username': 'your_username',
        'password': 'your_password'
    }

    # The with-block closes the session (and its pooled connections) even when
    # one of the requests raises.
    with requests.Session() as session:
        # Login request; the same session object is reused for the next request
        response = session.post(login_url, data=login_data)

        if response.status_code == 200:
            # Visit the page that requires a logged-in session
            dashboard = session.get(target_url)
            print(dashboard.text)
        else:
            print('登录失败')
    

    3. 进阶爬虫开发

    3.1. 处理动态加载内容(Selenium)

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager
    
    # Configure a headless browser
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # no visible browser window
    options.add_argument('--disable-gpu')

    # Download a matching chromedriver automatically and start Chrome
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    url = 'https://dynamic-website.com'
    try:
        driver.get(url)

        # Implicit wait: element lookups wait up to 10 seconds for the element to appear
        driver.implicitly_wait(10)

        # Read the dynamically loaded content
        dynamic_content = driver.find_element(By.CLASS_NAME, 'dynamic-content')
        print(dynamic_content.text)
    finally:
        # Always shut the browser down, even when loading or lookup fails,
        # so no browser/driver process is left running.
        driver.quit()
    

    3.2. 使用Scrapy框架

    # 创建Scrapy项目
    # scrapy startproject example_project
    # cd example_project
    # scrapy genspider example example.com
    
    # 示例spider代码
    import scrapy
    
    class ExampleSpider(scrapy.Spider):
        """Example spider that yields the page title and every link href."""

        name = 'example'
        allowed_domains = ['example.com']
        start_urls = ['http://example.com/']

        def parse(self, response):
            """Yield one item holding the page title and the list of hrefs."""
            # Extract the data with CSS selectors
            page_title = response.css('title::text').get()
            hrefs = response.css('a::attr(href)').getall()

            item = {'title': page_title, 'links': hrefs}
            yield item
    
    # 运行爬虫
    # scrapy crawl example -o output.json
    

    3.3. 分布式爬虫(Scrapy-Redis)

    # settings.py configuration
    # Scheduler class taken from the scrapy_redis package.
    SCHEDULER = "scrapy_redis.scheduler.Scheduler"
    # Duplicate-request filter class taken from the scrapy_redis package.
    DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
    # Redis connection URL (here: a local server on port 6379).
    REDIS_URL = 'redis://localhost:6379'
    
    # spider代码
    from scrapy_redis.spiders import RedisSpider
    
    class MyDistributedSpider(RedisSpider):
        """Skeleton spider built on scrapy_redis's RedisSpider base class."""

        name = 'distributed_spider'
        # presumably the Redis key from which start URLs are read — confirm against scrapy-redis docs
        redis_key = 'spider:start_urls'
    
        def parse(self, response):
            """Handle a downloaded response; the parsing logic is left as a stub."""
            # Parsing logic goes here
            pass
    

    4. 爬虫优化与反反爬策略

    4.1. 常见反爬机制及应对

    User-Agent检测:随机切换User-Agent
    IP限制:使用代理IP池
    验证码:OCR识别或打码平台
    行为分析:模拟人类操作间隔
    JavaScript渲染:使用Selenium或Pyppeteer

    4.2. 代理IP使用示例

    import requests
    
    # Proxy URL per scheme; 'proxy_ip:port' is a placeholder to be replaced.
    proxies = {
        'http': 'http://proxy_ip:port',
        'https': 'https://proxy_ip:port'
    }
    
    try:
        # The request is sent with the proxies mapping and a 5-second timeout.
        response = requests.get('https://example.com', proxies=proxies, timeout=5)
        print(response.text)
    except Exception as e:
        # NOTE(review): this catches every Exception, not only request failures — consider narrowing.
        print(f'请求失败: {e}')
    

    4.3. 随机延迟与请求头

    import random
    import time
    import requests
    from fake_useragent import UserAgent
    
    # Shared UserAgent instance; get_with_random_headers reads ua.random from it.
    ua = UserAgent()
    
    def random_delay(min_seconds=0.5, max_seconds=2.5):
        """Sleep for a random duration drawn uniformly from the given range.

        Args:
            min_seconds: Lower bound of the delay, in seconds (default 0.5).
            max_seconds: Upper bound of the delay, in seconds (default 2.5).

        Returns:
            None. Called without arguments it sleeps 0.5 to 2.5 seconds.
        """
        time.sleep(random.uniform(min_seconds, max_seconds))
    
    def get_with_random_headers(url):
        """GET *url* with a randomly chosen User-Agent after a random pause.

        Returns the response object produced by ``requests.get``.
        """
        request_headers = {}
        request_headers['User-Agent'] = ua.random
        request_headers['Accept-Language'] = 'en-US,en;q=0.5'
        request_headers['Referer'] = 'https://www.google.com/'

        # Pause before sending so consecutive requests are spaced irregularly.
        random_delay()
        response = requests.get(url, headers=request_headers)
        return response
    

    作者:cliffordl

    物联沃分享整理
    物联沃-IOTWORD物联网 » Python爬虫开发详解

    发表回复