Python爬虫开发详解
Python 爬虫入门
Python 爬虫开发
Python 爬虫工具 BeautifulSoup
文章目录
BeautifulSoup 官方文档
https://beautifulsoup.readthedocs.io/zh-cn/v4.4.0/
https://cloud.tencent.com/developer/article/1193258
https://blog.csdn.net/zcs2312852665/article/details/144804553
参考:
https://blog.51cto.com/haiyongblog/13806452
1. 常用库安装
pip install requests beautifulsoup4 scrapy selenium pandas webdriver-manager scrapy-redis fake-useragent
2. 基础爬虫开发
2.1. 使用 requests 获取网页内容
import requests
# Page to download: Baidu's real-time hot-search board.
url = 'https://top.baidu.com/board?tab=realtime'
# Browser-like User-Agent header sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
}
# Always pass a timeout: without one, requests may wait on an unresponsive
# server indefinitely.
response = requests.get(url, headers=headers, timeout=10)
print(response.status_code)  # 200 means success
print(response.text[:500])  # print the first 500 characters

2.2. 使用 BeautifulSoup 解析 HTML
from bs4 import BeautifulSoup
# Small in-memory HTML document used to demonstrate parsing.
html_doc = """
<html><head><title>测试页面</title></head>
<body>
<p class="title"><b>示例网站</b></p>
<p class="story">这是一个示例页面
<a href="http://example.com/1" class="link" id="link1">链接1</a>
<a href="http://example.com/2" class="link" id="link2">链接2</a>
</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')

# Get the page title.
print(soup.title.string)

# Get every link: print its href attribute and its text.
for link in soup.find_all('a'):
    print(link.get('href'), link.string)

# Look up an element by CSS class.
print(soup.find('p', class_='title').text)
2.2.1. 实战
import requests
from bs4 import BeautifulSoup, element
def url_unsplit(url: str, path: str, params: str = "", query: str = "", fragment: str = "") -> str:
    """Build a URL that reuses the scheme and host of *url* with new remaining parts.

    Only the scheme and network location of *url* are kept; its own path,
    query and fragment are discarded and replaced by the arguments.

    Args:
        url: Absolute URL that supplies the scheme and host.
        path: Path of the new URL. It is inserted verbatim, so a path that
            already carries a query string (e.g. ``/board?tab=realtime``)
            passes through unchanged.
        params: Optional path parameters, appended as ``;params``.
        query: Optional query string (without the leading ``?``).
        fragment: Optional fragment (without the leading ``#``).

    Returns:
        The assembled URL. It is also printed, followed by ``END``.
    """
    from urllib.parse import urlsplit, urlunparse

    base = urlsplit(url)
    # urlunparse (unlike urlunsplit) has a slot for ``params``, so the argument
    # is no longer silently dropped; with empty params the result is identical.
    new_url = urlunparse((base.scheme, base.netloc, path, params, query, fragment))
    print(new_url, "END")  # e.g. https://www.example.com/path/to/resource?x=100#anchor END
    return new_url
#
def fetch_header_with_headers_and_proxies(url, headers):
    """Download *url* and return the ``href`` of the first ``a.hot-title`` link.

    Args:
        url: Page to request.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        The ``href`` attribute of the first matching tag, or ``""`` when no
        tag matches or the tag has no ``href``.
    """
    # NOTE: despite the function name, no proxy is configured. To route the
    # request through one, pass a mapping such as:
    #   proxies = {'http': 'http://10.10.1.10:3128', 'https': 'https://10.10.1.10:1080'}
    #   response = requests.get(url, headers=headers, proxies=proxies)
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    tags = soup.select("a.hot-title")
    # select() returns a (possibly empty) result sequence, so emptiness is the
    # failure case to guard against; indexing [0] blindly would raise IndexError.
    if not tags:
        print("Error", type(tags), len(tags))
        return ""
    attrs = tags[0].attrs
    print(attrs)
    return attrs.get('href', "")
#
def fetch_hot_title_with_headers_and_proxies(url, headers):
    """Download *url* and return the ``href`` of the first ``a.more_Uf8LD`` link.

    Args:
        url: Page to request.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        The ``href`` attribute of the first matching tag (it may be a relative
        path such as ``/board?tab=realtime``), or ``""`` when no tag matches
        or the tag has no ``href``.
    """
    # NOTE: despite the function name, no proxy is configured here.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    tags = soup.select("a.more_Uf8LD")
    # Guard the empty result explicitly; indexing [0] blindly would raise
    # IndexError when the selector matches nothing.
    if not tags:
        print("Error", type(tags), len(tags))
        return ""
    attrs = tags[0].attrs
    print(attrs)
    return attrs.get('href', "")
#
def fetch_realtime_with_headers_and_proxies(url, headers):
    """Download the board page at *url* and print each entry's title and link.

    Every ``div.category-wrap_iQLoo.horizontal_1eKyQ`` element is treated as
    one entry. Its title is read from ``div.c-single-text-ellipsis`` and its
    link from the first ``<a>`` inside ``div.hot-desc_1m_jR.large_nSuFU``.

    Args:
        url: Page to request.
        headers: HTTP headers passed to ``requests.get``.

    Returns:
        ``""`` when no entry element matched at all; otherwise ``None`` (the
        results are only printed).
    """
    # NOTE: despite the function name, no proxy is configured here.
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    entries = soup.select("div.category-wrap_iQLoo.horizontal_1eKyQ")
    if not entries:
        print("Error", type(entries), len(entries))
        return ""
    for entry in entries:
        title_tag = entry.select("div.c-single-text-ellipsis")
        content_tag = entry.select("div.hot-desc_1m_jR.large_nSuFU")
        # Skip entries that lack a title, a description block or a link
        # instead of crashing on [0] / .a with IndexError or AttributeError.
        if not title_tag or not content_tag or content_tag[0].a is None:
            continue
        print(type(title_tag), title_tag[0].string)
        print(type(content_tag), content_tag[0].a.get('href', ""))
# 发送HTTP请求
url = "http://www.baidu.com/"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"}
# 1. 访问首页 获取热搜地址
hot_title = fetch_header_with_headers_and_proxies(url, headers)
print(f"1. {hot_title}")
# 2. 访问热搜 获取实时地址
realtime_path = fetch_hot_title_with_headers_and_proxies(hot_title, headers)
print(f"2. {realtime_path}")
# 3. 组装热搜地址,获取热搜 title和连接地址
realtime_new = url_unsplit(hot_title, realtime_path)
print(f"3. {realtime_new}")
realtime_path = fetch_realtime_with_headers_and_proxies(realtime_new, headers)
运行结果:
> python.exe .\main3.py
{'class': ['hot-title'], 'href': 'https://top.baidu.com/board?platform=pc&sa=pcindex_entry', 'target': '_blank'}
1. https://top.baidu.com/board?platform=pc&sa=pcindex_entry
{'class': ['more_Uf8LD'], 'href': '/board?tab=realtime', 'data-click': '{"pos":"i_more_realtime"}'}
2. /board?tab=realtime
https://top.baidu.com/board?tab=realtime END
3. https://top.baidu.com/board?tab=realtime
<class 'bs4.element.ResultSet'> 老百姓盼的 就是我们要干的
<class 'bs4.element.ResultSet'> https://www.baidu.com/s?wd=%E8%80%81%E7%99%BE%E5%A7%93%E7%9B%BC%E7%9A%84+%E5%B0%B1%E6%98%AF%E6%88%91%E4%BB%AC%E8%A6%81%E5%B9%B2%E7%9A%84&sa=fyb_news&rsv_dl=fyb_news
<class 'bs4.element.ResultSet'> 男友还原女子在三亚被蛇咬身亡过程
<class 'bs4.element.ResultSet'> https://www.baidu.com/s?wd=%E7%94%B7%E5%8F%8B%E8%BF%98%E5%8E%9F%E5%A5%B3%E5%AD%90%E5%9C%A8%E4%B8%89%E4%BA%9A%E8%A2%AB%E8%9B%87%E5%92%AC%E8%BA%AB%E4%BA%A1%E8%BF%87%E7%A8%8B&sa=fyb_news&rsv_dl=fyb_news
......
<class 'bs4.element.ResultSet'> 外国妈妈拿到“五星卡”向宝宝炫耀
<class 'bs4.element.ResultSet'> https://www.baidu.com/s?wd=%E5%A4%96%E5%9B%BD%E5%A6%88%E5%A6%88%E6%8B%BF%E5%88%B0%E2%80%9C%E4%BA%94%E6%98%9F%E5%8D%A1%E2%80%9D%E5%90%91%E5%AE%9D%E5%AE%9D%E7%82%AB%E8%80%80&sa=fyb_news&rsv_dl=fyb_news
2.3. 处理登录与会话
import requests
login_url = 'https://example.com/login'
target_url = 'https://example.com/dashboard'
# Form fields submitted with the login request (placeholders).
login_data = {
    'username': 'your_username',
    'password': 'your_password',
}
# One Session is reused for both requests; the with-block closes it (and its
# pooled connections) even if a request raises.
with requests.Session() as session:
    # Login request
    response = session.post(login_url, data=login_data)
    # NOTE: only the HTTP status is checked here; a site may also answer 200
    # for a rejected login, so verify the response body for real targets.
    if response.status_code == 200:
        # Visit the page that requires a login.
        dashboard = session.get(target_url)
        print(dashboard.text)
    else:
        print('登录失败')
3. 进阶爬虫开发
3.1. 处理动态加载内容(Selenium)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
# Configure a headless browser.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run without a visible window
options.add_argument('--disable-gpu')
# Download a matching chromedriver automatically.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
url = 'https://dynamic-website.com'
try:
    # Implicit wait: give element lookups up to 10 seconds to succeed.
    driver.implicitly_wait(10)
    driver.get(url)
    # Read the dynamically loaded content.
    dynamic_content = driver.find_element(By.CLASS_NAME, 'dynamic-content')
    print(dynamic_content.text)
finally:
    # Always shut the browser down, even when loading or the lookup fails;
    # otherwise the browser process is left running.
    driver.quit()
3.2. 使用Scrapy框架
# 创建Scrapy项目
# scrapy startproject example_project
# cd example_project
# scrapy genspider example example.com
# 示例spider代码
import scrapy
class ExampleSpider(scrapy.Spider):
    """Minimal spider that extracts the title and all link targets of a page."""

    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['http://example.com/']

    def parse(self, response):
        """Yield one item holding the page title text and every ``<a>`` href.

        Args:
            response: The downloaded page handed to the callback.

        Yields:
            A dict with keys ``'title'`` (first ``<title>`` text, or ``None``
            when absent) and ``'links'`` (list of all ``href`` values).
        """
        # Extract the data with CSS selectors.
        title = response.css('title::text').get()
        links = response.css('a::attr(href)').getall()
        yield {
            'title': title,
            'links': links,
        }
# 运行爬虫
# scrapy crawl example -o output.json
3.3. 分布式爬虫(Scrapy-Redis)
# settings.py: scrapy-redis configuration
# Scheduler class provided by the scrapy_redis package.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'
# Duplicate-request filter class provided by the scrapy_redis package.
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# Connection URL of the Redis server (local instance, port 6379).
REDIS_URL = "redis://localhost:6379"
# spider代码
from scrapy_redis.spiders import RedisSpider
class MyDistributedSpider(RedisSpider):
    """Skeleton of a distributed spider built on scrapy_redis' ``RedisSpider``."""

    name = 'distributed_spider'
    # Presumably the Redis key the spider reads its start URLs from -
    # confirm against the scrapy-redis documentation.
    redis_key = 'spider:start_urls'

    def parse(self, response):
        """Handle a downloaded page; the parsing logic is left unimplemented."""
        # Parsing logic goes here.
        pass
4. 爬虫优化与反反爬策略
4.1. 常见反爬机制及应对
User-Agent 检测:随机切换 User-Agent
IP限制:使用代理IP池
验证码:OCR识别或打码平台
行为分析:模拟人类操作间隔
JavaScript渲染:使用Selenium或Pyppeteer
4.2. 代理IP使用示例
import requests
# Proxy address per URL scheme; replace proxy_ip and port with real values.
proxies = {
    'http': 'http://proxy_ip:port',
    'https': 'https://proxy_ip:port',
}
try:
    # Keep only the network call inside the try so that a failure while
    # printing is not misreported as a failed request.
    response = requests.get('https://example.com', proxies=proxies, timeout=5)
except Exception as e:
    # Deliberately broad: this is the top level of a demo script, and a
    # malformed proxy URL may surface as a non-requests exception.
    print(f'请求失败: {e}')
else:
    print(response.text)
4.3. 随机延迟与请求头
import random
import time
import requests
from fake_useragent import UserAgent
# Module-level fake_useragent.UserAgent instance; its `random` attribute is
# read per request to fill the User-Agent header.
ua = UserAgent()
def random_delay(min_seconds: float = 0.5, max_seconds: float = 2.5) -> None:
    """Pause for a random time drawn uniformly between the two bounds.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.
    """
    time.sleep(random.uniform(min_seconds, max_seconds))
def get_with_random_headers(url):
    """GET *url* after a random pause, sending a randomly chosen User-Agent.

    Args:
        url: Address to request.

    Returns:
        The ``requests`` response object.
    """
    headers = {
        'User-Agent': ua.random,
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': 'https://www.google.com/',
    }
    # Pause before the request so consecutive calls are spaced irregularly.
    random_delay()
    # A timeout keeps a stalled server from blocking the crawler forever.
    return requests.get(url, headers=headers, timeout=10)
作者:cliffordl