python实现发票信息识别和处理

news2026/5/21 4:57:43

公司报销需要提供发票，一个周期内的发票攒在一起后，填写报销单特别繁琐，于是萌生了用 Python 简化报销流程的想法。

明确需求

公司报销单需要发票代码(短码)，金额，总计金额，如下图
在这里插入图片描述

开始编码

首先需要一个读取pdf的类库
pdfplumber

pip install pdfplumber

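pdfplumber 可以把 PDF 文件中的文本读出来；然后用正则库 re 找到指定字段；最后用 pandas 把处理好的数据写入 Excel（写 xlsx 还需要安装 openpyxl：pip install pandas openpyxl）。下面的代码需要先导入 os、re、pdfplumber 和 pandas（import pandas as pd），详细代码如下：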

def extract_invoice_data(pdf_path):
    """Extract the invoice number and the total amount from a PDF invoice.

    Pages are scanned in order. The first page on which both an invoice
    number and an amount are found determines the result; a page that has
    text but lacks either value is reported with ``print`` and skipped.
    Pages without extractable text are skipped silently.

    Args:
        pdf_path: Path of the PDF file to open with ``pdfplumber``.

    Returns:
        A tuple ``(invoice_number, amount)`` where ``invoice_number`` is a
        string of digits and ``amount`` is a ``float``, or ``(None, None)``
        when no page yields both values.
    """
    # All patterns are compiled once here instead of once per page.
    labelled_number_re = re.compile(r'发票号码\s*[:：]\s*(\d+)')
    # Fallbacks when the "发票号码" label is missing: a standalone 20-digit
    # run first, then a standalone 12-digit run.
    bare_number_res = (
        re.compile(r'\b\d{20}\b'),
        re.compile(r'\b\d{12}\b'),
    )
    # Preferred amount: the figure following the "（小写）" label. Both the
    # half-width (¥) and full-width (￥) currency signs, either kind of
    # parenthesis, and optional whitespace are accepted, so that the labelled
    # total is not missed merely because of how the PDF text was rendered.
    labelled_amount_re = re.compile(r'[（(]小写[）)]\s*[¥￥]\s*(\d+\.\d{2})')
    # Last resort: the first currency-prefixed figure on the page.
    any_amount_re = re.compile(r'[¥￥]\s*(\d+\.\d{2})')

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue

            # Invoice number: labelled form first, then the bare-digit fallbacks.
            invoice_code = None
            labelled = labelled_number_re.search(text)
            if labelled:
                invoice_code = labelled.group(1)
            else:
                for pattern in bare_number_res:
                    bare = pattern.search(text)
                    if bare:
                        invoice_code = bare.group(0)
                        break

            if invoice_code is None:
                print(f'{pdf_path}::发票代码未找到')
                continue

            # Amount: labelled total first, then any currency-prefixed figure.
            amount_match = labelled_amount_re.search(text) or any_amount_re.search(text)
            if amount_match is None:
                print(f'{pdf_path}::金额未找到')
                continue

            # The amount is returned as a float so callers can sum it.
            return invoice_code, float(amount_match.group(1))

    return None, None

def process_pdf_files(directory_path):
    """Extract invoice data from every PDF in a directory into an Excel file.

    Each PDF is passed to ``extract_invoice_data``; successful results are
    collected into a table with a trailing "总计" row holding the sum of all
    amounts, and written to ``output.xlsx`` inside ``directory_path``.
    A message box is shown when the file has been written.

    Relies on the module-level names ``progress_bar``, ``root`` and
    ``messagebox``, which are defined outside this function (presumably
    Tkinter widgets/modules -- confirm against the GUI setup code).

    Args:
        directory_path: Directory that contains the PDF invoices and that
            receives ``output.xlsx``.
    """
    columns = ['文件名', '发票号码', '金额合计']

    # Match the extension case-insensitively so "X.PDF" is not skipped, and
    # sort because os.listdir() returns entries in unspecified order.
    files = sorted(
        f for f in os.listdir(directory_path) if f.lower().endswith('.pdf')
    )

    data = []

    # The progress bar counts processed files.
    progress_bar["maximum"] = len(files)

    for i, file_name in enumerate(files):
        pdf_path = os.path.join(directory_path, file_name)
        invoice_code, amount = extract_invoice_data(pdf_path)
        # Compare against None explicitly: an amount of 0.0 is a valid value
        # and must not be treated as "not found".
        if invoice_code is not None and amount is not None:
            data.append({
                '文件名': file_name,
                '发票号码': invoice_code,
                '金额合计': amount
            })
        else:
            print(f"未从文件 {file_name} 中找到有效的发票号码或金额")

        progress_bar["value"] = i + 1
        root.update_idletasks()

    # Passing the column names explicitly keeps the '金额合计' column present
    # even when no invoice was extracted; without it the lookup below raises
    # KeyError on an empty result.
    df = pd.DataFrame(data, columns=columns)

    total_amount = df['金额合计'].sum()

    # Single-row frame holding the grand total.
    summary_row = pd.DataFrame({
        '文件名': ['总计'],
        '发票号码': [''],
        '金额合计': [total_amount]
    })

    # With no invoice rows the summary row is the whole table; otherwise it
    # is appended after the invoice rows.
    if df.empty:
        df = summary_row
    else:
        df = pd.concat([df, summary_row], ignore_index=True)

    output_file = os.path.join(directory_path, 'output.xlsx')
    df.to_excel(output_file, index=False, engine='openpyxl')

    messagebox.showinfo("完成", f"处理完成！文件已保存为: {output_file}")

为了使用起来更方便，我直接把程序打包成了 exe，并加入了输入框和进度条。