Python实现PDF文件中文本、表格、图片的读取

Python 读取 PDF 文件中的文本、表格、图片

提示:文章写完后,目录可以自动生成,如何生成可参考右边的帮助文档

文章目录

  • 一、文本读取
  • 二、图片读取
  • 三、表格读取

  • 一、文本读取

    基于fitz

    import fitz  # PyMuPDF

    # Collect the text of every page, skipping image blocks and very short
    # blocks. The height threshold is only an example heuristic for dropping
    # table cells; tune it (or filter on position) for your own documents.
    pdf_file = "example.pdf"
    min_block_height = 20  # blocks lower than this (page units) are skipped
    text_parts = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_file) as pdf_document:
        for page in pdf_document:
            # Each block is (x0, y0, x1, y1, text, block_no, block_type).
            for block in page.get_text("blocks"):
                y0, y1 = block[1], block[3]
                text_block, block_type = block[4], block[6]
                # block_type 0 is text, 1 is image. Testing the type avoids
                # dropping real text that merely contains the word "image".
                if block_type != 0:
                    continue
                if y1 - y0 < min_block_height:
                    continue
                text_parts.append(text_block)
    # Join once instead of growing a string inside the loop.
    text = "".join(text_parts)
    print(text)
    

    二、图片读取

    基于fitz

    import fitz

    # Save every image referenced by each page of the PDF as a PNG file.
    doc = fitz.open("example.pdf")
    for page_index, page in enumerate(doc):
        image_list = page.get_images()
        # Report how many images this page references.
        if not image_list:
            print("No images found on page", page_index)
        else:
            print(f"Found {len(image_list)} images on page {page_index}")
        # The first field of each image entry is its XREF number.
        for image_index, (xref, *_unused) in enumerate(image_list, start=1):
            pix = fitz.Pixmap(doc, xref)
            colour_channels = pix.n - pix.alpha
            if colour_channels > 3:
                # More than three colour channels (e.g. CMYK): convert to RGB
                # before saving as PNG.
                pix = fitz.Pixmap(fitz.csRGB, pix)
            target_name = "page_%s-image_%s.png" % (page_index, image_index)
            pix.save(target_name)
            pix = None  # drop the reference to the pixmap
    

    三、表格读取

    基于fitz

    import fitz  # PyMuPDF; Page.find_tables() needs version 1.23.0 or newer

    # Detect the tables on every page and print their cell contents.
    # (The previous snippet here was a copy of the image-extraction example
    # and did not read any tables.)
    with fitz.open("example.pdf") as doc:
        for page_index, page in enumerate(doc):
            # find_tables() returns a TableFinder; .tables is the list of hits.
            tables = page.find_tables().tables
            if tables:
                print(f"Found {len(tables)} tables on page {page_index}")
            else:
                print("No tables found on page", page_index)
            for table_index, table in enumerate(tables, start=1):
                print(f"page {page_index} - table {table_index}:")
                # extract() yields the table as a list of rows, each row a
                # list of cell strings (None for empty cells).
                for row in table.extract():
                    print(row)
    

    基于fitz,将表格数据当作文本内容抽取

    import fitz

    # Write the plain text of every page to output.txt, separating pages with
    # a form feed. Table contents come out as ordinary text lines.
    # Both resources are managed by `with`, so the document and the output
    # file are closed even if extracting a page raises.
    with fitz.open("example.pdf") as doc, open("output.txt", "wb") as out:
        for page in doc:
            # get_text() returns str; encode once at the file boundary.
            out.write(page.get_text().encode("utf8"))
            out.write(b"\x0c")  # page delimiter (form feed, 0x0C)
    

    基于pdfplumber

    import pdfplumber
    import pandas as pd

    # Open the PDF; the context manager closes the file when the block exits.
    with pdfplumber.open("example.pdf") as pdf:
        # pdf.pages is zero-indexed, so index 1 is the SECOND page.
        second_page = pdf.pages[1]
        # Detect the tables on the page; each table is a list of rows.
        tables = second_page.extract_tables(table_settings={})
        for rows in tables:
            if not rows:
                # Nothing to use as a header row; skip instead of raising.
                continue
            # Treat the first row as the column headers, the rest as data.
            header, *body = rows
            frame = pd.DataFrame(body, columns=header)
            print(frame)
    
    物联沃分享整理
    物联沃-IOTWORD物联网 » Python实现PDF文件中文本、表格、图片的读取

    发表评论