代码收藏家技术教程 2023-06-17

Python爬虫实战：使用爬虫获取天气预报数据

1. 项目简介

在中国天气网(http://www.weather.com.cn)中输入一个城市的名称，例如输入深圳，那么会转到地址http://www.weather.com.cn/weather1d/101280601.shtml的网页显示深圳的天气预报，其中101280601是深圳的代码，每个城市或者地区都有一个代码。如下图：

在上图中可以看到，深圳今天，7天，8-15天等的天气数据，这里爬取7天的天气预报数据。

2. HTML 代码分析

分析这段代码：

7天的天气预报实际上在一个<ul class="t clearfix">元素中，每天是一个M<li>元素，7天的结构差不多是一样的(注意：今天没有最高温度与最低温度)。

3. 爬取天气预报数据

from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit  # BS内置库，用于推测文档编码
import urllib.request  # 发起请求，获取响应

url = "http://www.weather.com.cn/weather/101280601.shtml"

try:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 Edg/109.0.1518.78"}
    req = urllib.request.Request(url, headers=headers)  # 创建请求对象
    data = urllib.request.urlopen(req)  # 发起请求
    data = data.read()  # 获得响应体
    dammit = UnicodeDammit(data, ["utf-8", "gbk"])
    data = dammit.unicode_markup  # 解码
    soup = BeautifulSoup(data, "lxml")
    lis = soup.select("ul[class='t clearfix'] li")
    x = 0
    for li in lis:
        try:
            date = li.select('h1')[0].text
            weather = li.select('p[class="wea"]')[0].text
            if x == 0:  # 为今天只有一个温度做判断 <i>14℃</i>
                x += 1
                temp = li.select('p[class="tem"] i')[0].text
            else:
                temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
            print(date, weather, temp)
            # 22日（今天） 晴 14℃
            # 23日（明天） 晴 23℃/14℃
            # 24日（后天） 晴转多云 25℃/13℃
            # 25日（周六） 多云 21℃/13℃
            # 26日（周日） 多云转晴 22℃/12℃
            # 27日（周一） 晴 21℃/12℃
            # 28日（周二） 晴 24℃/14℃
        except Exception as err:
            print(err)
except Exception as err:
    print(err)

4. 爬取与存储天气预报数据

获取北京、上海、广州、深圳等城市的代码，爬取这些城市的天气预报数据，并存储到sqllite数据库weathers.db中，存储的数据表weathers是：

create table weathers (wCity varchar(16),wDate varchar(16),wWeather varchar(64),wTemp varchar(32),constraint pk_weather primary key (wCity,wDate))"

编写程序依次爬取各个城市的天气预报数据存储在数据库中，程序如下：

from bs4 import BeautifulSoup
from bs4.dammit import UnicodeDammit
import urllib.request
import sqlite3


# 天气数据库
class WeatherDB:
    def __init__(self):
        self.cursor = None
        self.con = None

    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "create table weathers (wCity varchar(16),"
                "wDate varchar(16),"
                "wWeather varchar(64),"
                "wTemp varchar(32),"
                "constraint pk_weather primary key (wCity,wDate))")
        except Exception as err:
            print(err)
            self.cursor.execute("delete from weathers")

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute("insert into weathers (wCity,wDate,wWeather,wTemp) values (?,?,?,?)",
                                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("select * from weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("city", "date", "weather", "temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


# 天气预报
class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) "
                          "Gecko/2008072421 Minefield/3.0.2pre"}
        self.cityCode = {"北京": "101010100", "上海": "101020100", "广州": "101280101", "深圳": "101280601"}

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " 找不到代码")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            x = 0
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    if x == 0:  # 为今天只有一个温度做判断 <i>14℃</i>
                        x += 1
                        temp = li.select('p[class="tem"] i')[0].text
                    else:
                        temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                    print(city, date, weather, temp)
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)

    def process(self, cities):
        self.db = WeatherDB()
        self.db.openDB()

        for city in cities:
            self.forecastCity(city)

        # self.db.show()
        self.db.closeDB()


ws = WeatherForecast()
ws.process(["北京", "上海", "广州", "深圳"])
print("completed")

程序执行结果如下：