使用 pdfplumber 进行 PDF 文件版面深度分析与解析提取
pdfplumber 的特点
1、它是一个纯 Python 第三方库,适用于 Python 3.x 版本
2、它用来查看pdf各类信息,能有效提取文本、表格
3、它不支持修改或生成pdf,也不支持对pdf扫描件的处理
import glob
import json
import os
import re
from collections import defaultdict

import pdfplumber
class PDFProcessor:
    """Parse a PDF with pdfplumber, extracting text lines and tables in
    reading order.

    Results accumulate in ``self.all_text``: a mapping from a global row
    counter to ``{'page', 'allrow', 'type', 'inside'}`` records, where
    ``type`` is ``'text'``, ``'excel'`` (a table row), ``'页眉'`` (header)
    or ``'页脚'`` (footer).
    """

    def __init__(self, filepath):
        self.filepath = filepath
        # Open the document; note pdfplumber cannot process scanned PDFs.
        self.pdf = pdfplumber.open(filepath)
        # Global row index -> extracted record.
        self.all_text = defaultdict(dict)
        # Next free row index.
        self.allrow = 0
        # Index of the last row written before the current page
        # (used for header/footer detection).
        self.last_num = 0

    def check_lines(self, page, top, buttom):
        """Collect the words of *page* between the y-coordinates *top* and
        *buttom* and merge them into lines.

        Words whose tops differ by <= 2pt are joined into one visual line.
        A word is also joined to the previous line when that line ended near
        the right margin (probable wrap) and does not already end with a
        sentence terminator. Pass ``''`` for *top* and/or *buttom* to leave
        that bound open.

        Returns the merged text with ``'\\n'`` between lines.
        """
        lines = page.extract_words()
        text = ''
        last_top = 0
        last_check = 0
        # "This line is already complete" endings (punctuation, digits,
        # unit markers, report titles) -- never merge past these.
        check_re = r'(?:。|;|单位:元|单位:万元|币种:人民币|\d|报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$'
        for each_line in lines:
            if top == '' and buttom == '':
                # Whole page: no vertical bounds.
                if abs(last_top - each_line['top']) <= 2:
                    text = text + each_line['text']
                elif last_check > 0 and (page.height * 0.9 - each_line['top']) > 0 \
                        and not re.search(check_re, text):
                    text = text + each_line['text']
                else:
                    text = text + '\n' + each_line['text']
            elif top == '':
                # Only words below *buttom* (i.e. after the previous table).
                if each_line['top'] > buttom:
                    if abs(last_top - each_line['top']) <= 2:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 \
                            and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        text = text + '\n' + each_line['text']
            else:
                # Words strictly between the two bounds.
                if each_line['top'] < top and each_line['top'] > buttom:
                    if abs(last_top - each_line['top']) <= 2:
                        text = text + each_line['text']
                    elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 \
                            and not re.search(check_re, text):
                        text = text + each_line['text']
                    else:
                        text = text + '\n' + each_line['text']
            last_top = each_line['top']
            # > 0 when the word ends within the right 15% of the page width,
            # i.e. the line probably wraps onto the next one.
            last_check = each_line['x1'] - page.width * 0.85
        return text

    def drop_empty_cols(self, data):
        """Drop every column whose cells are all empty strings.

        *data* is a list of equal-length rows; returns the filtered rows.
        """
        transposed_data = list(map(list, zip(*data)))  # rows -> columns
        # Was `cell is ''` (identity comparison); `==` is the correct test.
        filtered_data = [col for col in transposed_data
                         if not all(cell == '' for cell in col)]
        result = list(map(list, zip(*filtered_data)))  # columns -> rows
        return result

    @staticmethod
    def keep_visible_lines(obj):
        """pdfplumber filter predicate keeping only visible objects.

        A ``rect`` is kept when it has a ``non_stroking_color`` and is at
        least 1pt in one dimension; a ``char`` is kept when both of its
        colors are set. Everything else passes through.
        """
        if obj['object_type'] == 'rect':
            if obj['non_stroking_color'] is None:
                return False
            if obj['width'] < 1 and obj['height'] < 1:
                return False
        if obj['object_type'] == 'char':
            return obj['stroking_color'] is not None and obj['non_stroking_color'] is not None
        return True

    def extract_text_and_tables(self, page):
        """Extract tables and surrounding text from *page* into
        ``self.all_text``, then tag the page's header/footer rows."""
        buttom = 0
        page = page.filter(self.keep_visible_lines)
        tables = page.find_tables()
        if len(tables) >= 1:
            count = len(tables)
            for table in tables:
                if table.bbox[3] < buttom:
                    # Table lies above text already consumed; skip it.
                    pass
                else:
                    count -= 1
                    top = table.bbox[1]
                    # Text between the previous table bottom and this table top.
                    text = self.check_lines(page, top, buttom)
                    for piece in text.split('\n'):
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'text', 'inside': piece}
                        self.allrow += 1
                    buttom = table.bbox[3]
                    new_table = table.extract()
                    r_count = 0
                    # Merge continuation rows (first cell None) upward into
                    # the last row that had a first-column value.
                    for r in range(len(new_table)):
                        row = new_table[r]
                        if row[0] is None:
                            r_count += 1
                            for c in range(len(row)):
                                if row[c] is not None and row[c] not in ['', ' ']:
                                    if new_table[r - r_count][c] is None:
                                        new_table[r - r_count][c] = row[c]
                                    else:
                                        new_table[r - r_count][c] += row[c]
                                    new_table[r][c] = None
                        else:
                            r_count = 0
                    # Keep only rows that still have a first-column value and
                    # at least one non-empty cell; flatten newlines in cells.
                    end_table = []
                    for row in new_table:
                        if row[0] is not None:
                            cell_list = []
                            cell_check = False
                            for cell in row:
                                if cell is not None:
                                    cell = cell.replace('\n', '')
                                else:
                                    cell = ''
                                if cell != '':
                                    cell_check = True
                                cell_list.append(cell)
                            if cell_check:
                                end_table.append(cell_list)
                    end_table = self.drop_empty_cols(end_table)
                    for row in end_table:
                        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                      'type': 'excel', 'inside': str(row)}
                        self.allrow += 1
            if count == 0:
                # Remaining text below the last table.
                text = self.check_lines(page, '', buttom)
                for piece in text.split('\n'):
                    self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                                  'type': 'text', 'inside': piece}
                    self.allrow += 1
        else:
            # No tables: the whole page is plain text.
            text = self.check_lines(page, '', '')
            for piece in text.split('\n'):
                self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                              'type': 'text', 'inside': piece}
                self.allrow += 1

        # --- header / footer tagging ---------------------------------
        # A header line looks like a report title (not preceded by 计).
        first_re = r'[^计](?:报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$'
        # A footer line is made only of digits / '\' / '/' / 第 / 共 / 页 /
        # '-' / '_' / spaces. NOTE: the original non-raw pattern '\\|\/'
        # collapsed to the regex `\|\/` (literal '|/'); fixed here so a
        # lone backslash or slash matches as intended.
        end_re = r'^(?:\d|\\|/|第|共|页|-|_| ){1,}'
        if self.last_num == 0:
            # First page: candidate header is row 1 as in the original code
            # (presumably skipping the cover line -- TODO confirm).
            try:
                first_text = str(self.all_text[1]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                if re.search(first_re, first_text) and '[' not in end_text:
                    self.all_text[1]['type'] = '页眉'
                if re.search(end_re, end_text) and '[' not in end_text:
                    self.all_text[len(self.all_text) - 1]['type'] = '页脚'
            except Exception:
                # Best effort: a page with too few rows simply is not tagged.
                print(page.page_number)
        else:
            try:
                first_text = str(self.all_text[self.last_num + 2]['inside'])
                end_text = str(self.all_text[len(self.all_text) - 1]['inside'])
                if re.search(first_re, first_text) and '[' not in end_text:
                    self.all_text[self.last_num + 2]['type'] = '页眉'
                if re.search(end_re, end_text) and '[' not in end_text:
                    self.all_text[len(self.all_text) - 1]['type'] = '页脚'
            except Exception:
                print(page.page_number)
        self.last_num = len(self.all_text) - 1

    def process_pdf(self):
        """Run extraction over every page of the document, in order."""
        for page in self.pdf.pages:
            self.extract_text_and_tables(page)

    def save_all_text(self, path):
        """Write each extracted record to *path*, one JSON object per line (UTF-8)."""
        with open(path, 'w', encoding='utf-8') as file:
            for key in self.all_text.keys():
                file.write(json.dumps(self.all_text[key], ensure_ascii=False) + '\n')
def process_all_pdfs_in_folder(folder_path):
    """Process every file in *folder_path* as a PDF.

    Each file is parsed with :class:`PDFProcessor` and its records are
    written as a ``.txt`` file (one JSON object per line) into the
    hard-coded ``RAG_ASMPLE_DATAS_TXTS`` directory. Failures on one file
    are reported and do not stop the batch.
    """
    file_paths = sorted(glob.glob(f'{folder_path}/*'), reverse=True)
    for file_path in file_paths:
        print(file_path)
        try:
            processor = PDFProcessor(file_path)
            processor.process_pdf()
            # os.path.basename handles platform separators; the original
            # split('/') broke on Windows-style paths.
            base_name = os.path.basename(file_path).replace('.pdf', '.txt')
            save_path = 'RAG_ASMPLE_DATAS_TXTS/' + base_name
            processor.save_all_text(save_path)
        except Exception as exc:
            # Best-effort batch: report which file failed and why, instead
            # of the original silent bare `except: print('check')`.
            print('check', file_path, exc)
if __name__ == '__main__':
    # Path of the PDF file to parse.
    pdf_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.pdf'
    # Destination text file for the parsed content.
    out_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.txt'
    parser = PDFProcessor(pdf_path)
    parser.process_pdf()
    parser.save_all_text(out_path)
参考
版面分析–PDF解析神器pdfplumber
版面分析–富文本txt读取
补充
提取PDF中的图片并保存到本地
import pdfplumber
import os
# 定义函数用于提取PDF中的图片并保存
def extract_images_from_pdf(pdf_file, output_folder):
    """Save every embedded image of *pdf_file* into *output_folder*.

    Files are named ``image_<page>_<index>.png``. Page dimensions are
    printed for each page as a progress trace.
    """
    # Make sure the destination directory exists.
    os.makedirs(output_folder, exist_ok=True)
    with pdfplumber.open(pdf_file) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            print(f'页码:{page.page_number}')
            print(f'页面宽度:{page.width}')
            print(f'页面高度:{page.height}')
            for idx, image in enumerate(page.images, start=1):
                # Raw bytes of the image's XObject stream.
                image_data = image['stream'].get_data()
                image_filename = os.path.join(output_folder, f'image_{page_number}_{idx}.png')
                with open(image_filename, 'wb') as f:
                    f.write(image_data)
                print(f'图片已保存至:{image_filename}')
# Example usage
source_pdf = 'example.pdf'
images_dir = 'extracted_images'
extract_images_from_pdf(source_pdf, images_dir)
提取pdf 表格文本,保存为excel文件
import pdfplumber
from openpyxl import Workbook
# 定义函数用于提取PDF中的表格并保存为Excel文件
def extract_tables_to_excel(pdf_file, excel_output_file):
    """Append the table of every page of *pdf_file* to a single Excel sheet.

    ``page.extract_table()`` yields one table per page (or ``None``);
    all rows are written sequentially into the active worksheet.
    """
    with pdfplumber.open(pdf_file) as pdf:
        workbook = Workbook()
        sheet = workbook.active
        for page in pdf.pages:
            rows = page.extract_table()
            # Skip pages without a detectable table.
            if rows:
                for row in rows:
                    sheet.append(row)
        workbook.save(excel_output_file)
# Example usage
source_pdf = 'example.pdf'
xlsx_path = 'tables.xlsx'
extract_tables_to_excel(source_pdf, xlsx_path)
提取PDF表格 文本
import pdfplumber
# 定义函数用于提取PDF中的表格并保存为文本文件
def extract_tables_to_text(pdf_file, text_output_file):
    """Extract the table of every page of *pdf_file* and write it to
    *text_output_file* as tab-separated lines (UTF-8).

    Fix: pdfplumber returns ``None`` for empty/merged cells; those are now
    written as empty strings instead of the literal text ``'None'``.
    """
    with pdfplumber.open(pdf_file) as pdf:
        with open(text_output_file, 'w', encoding='utf-8') as output:
            for page in pdf.pages:
                table = page.extract_table()
                # Skip pages without a detectable table.
                if table:
                    for row in table:
                        output.write('\t'.join('' if cell is None else str(cell)
                                               for cell in row) + '\n')
# Example usage
source_pdf = 'example.pdf'
tables_txt = 'tables.txt'
extract_tables_to_text(source_pdf, tables_txt)
提取PDF纯文本
import pdfplumber
# 定义函数用于提取PDF中的纯文本并保存为文本文件
def extract_text_to_file(pdf_file, text_output_file):
    """Write the plain text of every page of *pdf_file* to *text_output_file* (UTF-8)."""
    with pdfplumber.open(pdf_file) as pdf, \
            open(text_output_file, 'w', encoding='utf-8') as output:
        for page in pdf.pages:
            # extract_text() may return None for pages without text content.
            page_text = page.extract_text()
            if page_text:
                output.write(page_text)
# Example usage
source_pdf = 'example.pdf'
text_path = 'text.txt'
extract_text_to_file(source_pdf, text_path)
读取富文本txt
Python 读取文件内容常用的函数有三种:read()、readline()、readlines()
# Read the entire file in one call
with open('story.txt', 'r', encoding='utf-8') as f:
    content = f.read()
    print(content)

# Read only the first line
with open('story.txt', 'r', encoding='utf-8') as f:
    first_line = f.readline()
    print(first_line)

# Read all lines, stripping the newline from each before printing
with open('story.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        line = line.strip('\n')
        print(line)
作者:桂花很香,旭很美