当前位置：物联沃-IOTWORD物联网 > 技术教程 > Python爬虫开发详解

代码收藏家技术教程 2025-06-20

Python爬虫开发详解

Python 爬虫入门
Python 爬虫开发
Python 爬虫工具 BeautifulSoup

文章目录

1. 常用库安装

2. 基础爬虫开发

2.1. 使用 requests 获取网页内容

2.2. 使用 BeautifulSoup 解析 HTML

2.2.1. 实战

2.3. 处理登录与会话

3. 进阶爬虫开发

3.1. 处理动态加载内容（Selenium）

3.2. 使用Scrapy框架

3.3. 分布式爬虫（Scrapy-Redis）

4. 爬虫优化与反反爬策略

4.1. 常见反爬机制及应对

4.2. 代理IP使用示例

4.3. 随机延迟与请求头

BeautifulSoup 官方文档

https://beautifulsoup.readthedocs.io/zh-cn/v4.4.0/

https://cloud.tencent.com/developer/article/1193258

https://blog.csdn.net/zcs2312852665/article/details/144804553

参考：
https://blog.51cto.com/haiyongblog/13806452

1. 常用库安装

pip install requests beautifulsoup4 scrapy selenium pandas webdriver-manager scrapy-redis fake-useragent

2. 基础爬虫开发

2.1. 使用 requests 获取网页内容

import requests

url = 'https://top.baidu.com/board?tab=realtime'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

# Always pass a timeout: without one, requests.get can block forever when the
# server accepts the connection but never answers.
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)     # 200 means success
print(response.text[:500])      # print the first 500 characters

2.2. 使用 BeautifulSoup 解析 HTML

from bs4 import BeautifulSoup

# Small in-memory document used to demonstrate the parser.
html_doc = """
            <html><head><title>测试页面</title></head>
            <body>
            <p class="title"><b>示例网站</b></p>
            <p class="story">这是一个示例页面
            <a href="http://example.com/1" class="link" id="link1">链接1</a>
            <a href="http://example.com/2" class="link" id="link2">链接2</a>
            </p>
            """

soup = BeautifulSoup(html_doc, 'html.parser')

# Text of the <title> element
page_title = soup.title.string
print(page_title)

# Every <a> element: its href attribute followed by its text
anchors = soup.find_all('a')
for anchor in anchors:
    href, label = anchor.get('href'), anchor.string
    print(href, label)

# First <p> element carrying the CSS class "title"
title_paragraph = soup.find('p', class_='title')
print(title_paragraph.text)

2.2.1. 实战

import requests
from bs4 import BeautifulSoup, element

def url_unsplit(url: str, path: str, params: str="", query: str="", fragment: str=""):
    """Build a new URL that reuses the scheme and host of ``url``.

    Only the scheme and network location of ``url`` are kept; its own path,
    query and fragment are discarded and replaced by the arguments.

    Args:
        url: Absolute URL that supplies the scheme and network location.
        path: Path of the new URL. A path that already carries a query
            string (e.g. ``/board?tab=realtime``) is passed through as-is.
        params: Parameters for the last path segment, appended as
            ``;params`` when non-empty.
        query: Query string without the leading ``?``.
        fragment: Fragment identifier without the leading ``#``.

    Returns:
        The assembled URL. It is also printed, followed by ``END``.
    """
    from urllib.parse import urlsplit, urlunparse

    base = urlsplit(url)

    # urlunparse (six components) is used instead of urlunsplit (five) so that
    # ``params`` is honoured rather than silently dropped.
    new_url = urlunparse((base.scheme, base.netloc, path, params, query, fragment))
    print(new_url, "END")  # e.g. https://www.example.com/path/to/resource?x=100#anchor END
    return new_url

# 
def fetch_header_with_headers_and_proxies(url, headers, proxies=None, timeout=10):
    """Download ``url`` and return the href of its first ``a.hot-title`` link.

    Args:
        url: Page to download.
        headers: HTTP request headers passed to ``requests.get``.
        proxies: Optional proxy mapping passed to ``requests.get``, for
            example ``{'http': 'http://10.10.1.10:3128'}``. ``None`` means
            no explicit proxies.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``href`` attribute of the first matching link, or ``""`` when the
        page contains no such link or the link has no ``href``.
    """
    response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    tags = soup.select("a.hot-title")

    # select() always returns a ResultSet, so a type check can never fail;
    # emptiness is the real error case, and indexing an empty result would
    # raise IndexError.
    if not tags:
        print("Error", type(tags), len(tags))
        return ""

    attrs = tags[0].attrs
    print(attrs)
    return attrs.get('href', "")

# 
def fetch_hot_title_with_headers_and_proxies(url, headers, proxies=None, timeout=10):
    """Download ``url`` and return the href of its first ``a.more_Uf8LD`` link.

    Args:
        url: Page to download.
        headers: HTTP request headers passed to ``requests.get``.
        proxies: Optional proxy mapping passed to ``requests.get``; ``None``
            means no explicit proxies.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``href`` attribute of the first matching link, or ``""`` when the
        page contains no such link or the link has no ``href``.
    """
    response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    tags = soup.select("a.more_Uf8LD")

    # select() always returns a ResultSet; an empty one is the failure case
    # and must be caught before indexing into it.
    if not tags:
        print("Error", type(tags), len(tags))
        return ""

    attrs = tags[0].attrs
    print(attrs)
    return attrs.get('href', "")

# 
def fetch_realtime_with_headers_and_proxies(url, headers, proxies=None, timeout=10):
    """Download ``url`` and print the title and link of every board entry.

    Each ``div.category-wrap_iQLoo.horizontal_1eKyQ`` element is treated as
    one entry: its title is read from ``div.c-single-text-ellipsis`` and its
    link from the ``<a>`` inside ``div.hot-desc_1m_jR.large_nSuFU``.

    Args:
        url: Page to download.
        headers: HTTP request headers passed to ``requests.get``.
        proxies: Optional proxy mapping passed to ``requests.get``; ``None``
            means no explicit proxies.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        Always ``""``; the results are only printed.
    """
    response = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    tags = soup.select("div.category-wrap_iQLoo.horizontal_1eKyQ")

    # select() always returns a ResultSet; report an empty one explicitly.
    if not tags:
        print("Error", type(tags), len(tags))
        return ""

    for tag in tags:
        title_tag = tag.select("div.c-single-text-ellipsis")
        content_tag = tag.select("div.hot-desc_1m_jR.large_nSuFU")

        # Skip entries whose markup lacks the title, the description or the
        # link, instead of crashing on [0] or on a missing <a>.
        if not title_tag or not content_tag or content_tag[0].a is None:
            continue

        print(type(title_tag), title_tag[0].string)
        print(type(content_tag), content_tag[0].a.attrs.get('href', ""))
    return ""


# Send the HTTP requests
url = "http://www.baidu.com/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}

# 1. Visit the home page to obtain the hot-search board URL
hot_title = fetch_header_with_headers_and_proxies(url, headers)
print(f"1. {hot_title}")
if not hot_title:
    # An empty URL would only make the next request fail with a less
    # helpful error, so stop here.
    raise SystemExit("hot-search link not found on the home page")

# 2. Visit the hot-search board to obtain the path of the realtime board
realtime_path = fetch_hot_title_with_headers_and_proxies(hot_title, headers)
print(f"2. {realtime_path}")
if not realtime_path:
    raise SystemExit("realtime board link not found on the hot-search page")

# 3. Assemble the realtime board URL, then print each entry's title and link
realtime_new = url_unsplit(hot_title, realtime_path)
print(f"3. {realtime_new}")
# The return value is always "" and carries no information, so it is not stored.
fetch_realtime_with_headers_and_proxies(realtime_new, headers)

运行结果：

> python.exe .\main3.py
{'class': ['hot-title'], 'href': 'https://top.baidu.com/board?platform=pc&sa=pcindex_entry', 'target': '_blank'}
1. https://top.baidu.com/board?platform=pc&sa=pcindex_entry
{'class': ['more_Uf8LD'], 'href': '/board?tab=realtime', 'data-click': '{"pos":"i_more_realtime"}'}
2. /board?tab=realtime
https://top.baidu.com/board?tab=realtime END
3. https://top.baidu.com/board?tab=realtime
<class 'bs4.element.ResultSet'>   老百姓盼的 就是我们要干的 
<class 'bs4.element.ResultSet'> https://www.baidu.com/s?wd=%E8%80%81%E7%99%BE%E5%A7%93%E7%9B%BC%E7%9A%84+%E5%B0%B1%E6%98%AF%E6%88%91%E4%BB%AC%E8%A6%81%E5%B9%B2%E7%9A%84&sa=fyb_news&rsv_dl=fyb_news
<class 'bs4.element.ResultSet'>   男友还原女子在三亚被蛇咬身亡过程 
<class 'bs4.element.ResultSet'> https://www.baidu.com/s?wd=%E7%94%B7%E5%8F%8B%E8%BF%98%E5%8E%9F%E5%A5%B3%E5%AD%90%E5%9C%A8%E4%B8%89%E4%BA%9A%E8%A2%AB%E8%9B%87%E5%92%AC%E8%BA%AB%E4%BA%A1%E8%BF%87%E7%A8%8B&sa=fyb_news&rsv_dl=fyb_news
......
<class 'bs4.element.ResultSet'>   外国妈妈拿到“五星卡”向宝宝炫耀 
<class 'bs4.element.ResultSet'> https://www.baidu.com/s?wd=%E5%A4%96%E5%9B%BD%E5%A6%88%E5%A6%88%E6%8B%BF%E5%88%B0%E2%80%9C%E4%BA%94%E6%98%9F%E5%8D%A1%E2%80%9D%E5%90%91%E5%AE%9D%E5%AE%9D%E7%82%AB%E8%80%80&sa=fyb_news&rsv_dl=fyb_news

2.3. 处理登录与会话

import requests

login_url = 'https://example.com/login'
target_url = 'https://example.com/dashboard'

# Login form fields
login_data = {
    'username': 'your_username',
    'password': 'your_password'
}

# A Session carries cookies from the login response over to later requests.
# Using it as a context manager closes its pooled connections when done.
with requests.Session() as session:
    # Login request
    response = session.post(login_url, data=login_data, timeout=10)

    # NOTE(review): some sites answer 200 even for a rejected login; checking
    # the response body or cookies may be needed for a real target.
    if response.status_code == 200:
        # Fetch a page that requires being logged in
        dashboard = session.get(target_url, timeout=10)
        print(dashboard.text)
    else:
        print('登录失败')

3. 进阶爬虫开发

3.1. 处理动态加载内容（Selenium）

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Configure a headless browser
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run without a visible window
options.add_argument('--disable-gpu')

# Download a matching chromedriver automatically
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# try/finally guarantees the browser process is shut down even when loading
# the page or locating the element raises.
try:
    url = 'https://dynamic-website.com'
    driver.get(url)

    # Implicit wait: element lookups keep retrying for up to 10 seconds
    driver.implicitly_wait(10)

    # Read the dynamically rendered content
    dynamic_content = driver.find_element(By.CLASS_NAME, 'dynamic-content')
    print(dynamic_content.text)
finally:
    driver.quit()

3.2. 使用Scrapy框架

# 创建Scrapy项目
# scrapy startproject example_project
# cd example_project
# scrapy genspider example example.com

# 示例spider代码
import scrapy

class ExampleSpider(scrapy.Spider):
    """Spider that emits the title and all link targets of each parsed page."""

    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    def parse(self, response):
        """Yield one item holding the page's title text and its link hrefs."""
        page_title = response.css('title::text').get()
        hrefs = response.css('a::attr(href)').getall()

        item = {'title': page_title, 'links': hrefs}
        yield item

# 运行爬虫
# scrapy crawl example -o output.json

3.3. 分布式爬虫（Scrapy-Redis）

# settings.py configuration
# Use the scheduler class shipped with scrapy_redis instead of Scrapy's default.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# Use the duplicate-request filter shipped with scrapy_redis.
# NOTE(review): presumably both keep their state in Redis so that several
# crawler processes can share it — confirm against the scrapy-redis docs.
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
# Connection URL of the Redis server (local instance, port 6379).
REDIS_URL = 'redis://localhost:6379'

# spider代码
from scrapy_redis.spiders import RedisSpider

class MyDistributedSpider(RedisSpider):
    """Skeleton of a spider built on scrapy_redis' ``RedisSpider``.

    ``parse`` is a placeholder; the extraction logic still has to be written.
    """

    name = 'distributed_spider'
    # NOTE(review): presumably the Redis key from which start URLs are read —
    # confirm against the scrapy-redis docs.
    redis_key = 'spider:start_urls'

    def parse(self, response):
        """Handle a downloaded response (not implemented yet)."""
        # Parsing logic goes here
        pass

4. 爬虫优化与反反爬策略

4.1. 常见反爬机制及应对

User-Agent检测：随机切换User-Agent
IP限制：使用代理IP池
验证码：OCR识别或打码平台
行为分析：模拟人类操作间隔
JavaScript渲染：使用Selenium或Pyppeteer

4.2. 代理IP使用示例

import requests

# Proxy to use per URL scheme; replace proxy_ip and port with real values.
proxies = {
    'http': 'http://proxy_ip:port',
    'https': 'https://proxy_ip:port'
}

try:
    response = requests.get('https://example.com', proxies=proxies, timeout=5)
    print(response.text)
# Catch only errors raised by requests (connection, proxy, timeout, ...), so
# that unrelated programming errors are not reported as a failed request.
except requests.RequestException as e:
    print(f'请求失败: {e}')

4.3. 随机延迟与请求头

import random
import time
import requests
from fake_useragent import UserAgent

# Single shared UserAgent instance; get_with_random_headers reads ua.random
# from it to fill the User-Agent header of each request.
ua = UserAgent()

def random_delay(min_seconds=0.5, max_seconds=2.5):
    """Sleep for a random duration drawn uniformly from the given range.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.

    The defaults reproduce the original fixed range of 0.5 to 2.5 seconds,
    so existing calls without arguments behave as before.
    """
    time.sleep(random.uniform(min_seconds, max_seconds))

def get_with_random_headers(url):
    """GET ``url`` after a random pause, sending a User-Agent taken from ``ua.random``.

    Returns the response object produced by ``requests.get``.
    """
    request_headers = {'User-Agent': ua.random}
    request_headers['Accept-Language'] = 'en-US,en;q=0.5'
    request_headers['Referer'] = 'https://www.google.com/'

    # Pause before sending so consecutive requests are not evenly spaced.
    random_delay()
    response = requests.get(url, headers=request_headers)
    return response

作者：cliffordl

物联沃分享整理
物联沃-IOTWORD物联网 » Python爬虫开发详解

代码收藏家普通

分享到：

发表回复取消回复