公司的发票需要走报销流程,而一个周期内的发票往往攒在一起处理,手工填写报销单特别繁琐,于是萌生了用 Python 简化报销流程的想法。
明确需求
公司报销单需要发票代码(短码),金额,总计金额,如下图
 
开始编码
首先需要一个读取pdf的类库
 pdfplumber
pip install pdfplumber
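这个类库可以把 PDF 文件中的文本读出来;接着用正则库 re 找到指定字段;最后用 pandas 类库(配合 openpyxl 引擎)把处理好的数据写入 Excel。下面的代码省略了文件开头的 import(os、re、pdfplumber、pandas as pd),以及界面部分定义的 root、progress_bar 和 messagebox。详细代码如下: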
def extract_invoice_data(pdf_path):
    """Extract the invoice number and the amount from a PDF invoice.

    Pages are scanned in order; the first page on which both an invoice
    number and an amount are found determines the result. A diagnostic line
    is printed for every page that has text but lacks one of the two fields.

    Args:
        pdf_path: Path of the PDF file, opened with ``pdfplumber``.

    Returns:
        A tuple ``(invoice_number, amount)`` where ``invoice_number`` is a
        string of digits and ``amount`` is a float, or ``(None, None)`` when
        no page yields both values.
    """
    # Labelled invoice number; accept ASCII and full-width colons.
    number_pattern = re.compile(r'发票号码\s*[::]\s*(\d+)')
    # Fallbacks: a run of exactly 20 digits, then exactly 12 digits.
    # Digit lookarounds are used instead of ``\b`` because CJK characters are
    # word characters in Unicode mode, so ``\b`` never matches between a
    # Chinese label and the digits that immediately follow it.
    fallback_number_patterns = (
        re.compile(r'(?<!\d)\d{20}(?!\d)'),
        re.compile(r'(?<!\d)\d{12}(?!\d)'),
    )
    # Preferred amount: the figure tagged "(小写)". The parentheses are matched
    # as literal characters (ASCII or full-width, optional) so that the only
    # capture group is the number itself. The second pattern is the generic
    # fallback: the first currency-sign-prefixed figure in the text.
    amount_patterns = (
        re.compile(r'[((]?小写[))]?\s*[¥¥]\s*(\d+\.\d{2})'),
        re.compile(r'[¥¥]\s*(\d+\.\d{2})'),
    )

    with pdfplumber.open(pdf_path) as pdf:
        # The data is usually on the first page, but every page is tried.
        for page in pdf.pages:
            text = page.extract_text()
            if not text:
                continue

            invoice_number = None
            labelled = number_pattern.search(text)
            if labelled:
                invoice_number = labelled.group(1)
            else:
                for pattern in fallback_number_patterns:
                    found = pattern.search(text)
                    if found:
                        invoice_number = found.group(0)
                        break
            if invoice_number is None:
                print(pdf_path + '::发票代码未找到')
                continue

            amount_match = None
            for pattern in amount_patterns:
                amount_match = pattern.search(text)
                if amount_match:
                    break
            if amount_match is None:
                print(pdf_path + '::金额未找到')
                continue

            # Group 1 is always the numeric text, so float() cannot fail here.
            return invoice_number, float(amount_match.group(1))
    return None, None
def process_pdf_files(directory_path):
    """Collect invoice data from every PDF in a directory into ``output.xlsx``.

    Each PDF is passed to ``extract_invoice_data``; successful results become
    one spreadsheet row each, followed by a final "总计" row holding the sum of
    all amounts. The workbook is written into ``directory_path`` itself.

    Relies on the module-level names ``progress_bar``, ``root`` and
    ``messagebox`` (presumably tkinter objects created by the GUI code —
    confirm against the rest of the file).

    Args:
        directory_path: Directory that contains the PDF invoices.
    """
    columns = ['文件名', '发票号码', '金额合计']

    # Match the extension case-insensitively so "*.PDF" files are not skipped;
    # sort so that the row order does not depend on os.listdir's ordering.
    files = sorted(
        f for f in os.listdir(directory_path) if f.lower().endswith('.pdf')
    )
    data = []

    progress_bar["maximum"] = len(files)

    for done, file_name in enumerate(files, start=1):
        pdf_path = os.path.join(directory_path, file_name)
        invoice_code, amount = extract_invoice_data(pdf_path)
        # Compare the amount against None explicitly: 0.0 is a valid amount.
        if invoice_code and amount is not None:
            data.append({
                '文件名': file_name,
                '发票号码': invoice_code,
                '金额合计': amount,
            })
        else:
            print(f"未从文件 {file_name} 中找到有效的发票号码或金额")
        progress_bar["value"] = done
        root.update_idletasks()

    # Passing explicit columns keeps the frame well-formed when nothing was
    # extracted (otherwise the column lookup below raises KeyError).
    df = pd.DataFrame(data, columns=columns)

    # Round to cents so binary floating-point noise does not reach the sheet.
    total_amount = round(float(df['金额合计'].sum()), 2) if data else 0.0

    summary_row = pd.DataFrame({
        '文件名': ['总计'],
        '发票号码': [''],
        '金额合计': [total_amount],
    })
    # Skip the concat when there are no data rows: the summary row alone is
    # the whole sheet.
    if data:
        df = pd.concat([df, summary_row], ignore_index=True)
    else:
        df = summary_row

    output_file = os.path.join(directory_path, 'output.xlsx')
    df.to_excel(output_file, index=False, engine='openpyxl')

    messagebox.showinfo("完成", f"处理完成!文件已保存为: {output_file}")
为了使用起来更方便,我直接打包成exe,并加入了输入框和进度条
成品展示

 完成的进度条
 
 生成的文件
 
效果
 
 成品的工具在这。
 提取码 Eb5K
附 virustotal查毒报告
 
 












![[C++]多态与虚函数](https://i-blog.csdnimg.cn/direct/a17bada2b7934967944d6f8985a5545c.png)






