前言
最近接到一个需求,就是在国税局的网页中输入
【发票代码】、【*发票号码】、【*开票日期】、【*开具金额(不含税)】、【*验证码】
然后将此发票进行下载。但是【验证码】真的是拦路虎啊!我也查阅了好多大佬的文章:有的是通过 JS 逆向(我一个小白根本看不懂),有的是通过打码平台。我自己也注册了一个打码平台,识别普通验证码确实可以,但对国税局的验证码还是无效——因为国税局的验证码有颜色区分(有的要求只输入指定颜色的字符,有的不区分颜色),如下图。打码平台虽然能把所有字符都识别出来,却无法按颜色进行识别。
所以我决定当代码执行到这里,就进行 input 手动输入,俗称人工解码,笑死
展示视频:
解决思路
1、目录结构
文件夹:【数据源】:用来存放发票信息
注意:时间格式:如2022-12-12 应写成:20221212
chromedriver.exe:Selenium 驱动 Chrome(谷歌)浏览器所需的驱动程序
需根据自己的 Chrome 浏览器版本下载对应版本:点我官网下载

2、读取数据源(代码如下)
# Read the first entry that os.listdir() reports for the data-source folder,
# loading every cell as a string, then turn each row into a plain list
# (a personal habit of the author).
df = pd.read_excel(
    os.path.join('./数据源/', os.listdir("./数据源/")[0]),
    sheet_name=0,
    dtype=str,
    keep_default_na='',
)
data = df.values.tolist()
3、selenium驱动浏览器(代码如下)
# Start Chrome through the chromedriver binary that sits next to the script
# and open the invoice verification site.
chrome_options = Options()
chrome_driver = "./chromedriver.exe"
# `options=` replaces the deprecated `chrome_options=` keyword; the driver
# path is still passed positionally, exactly as before.
driver = webdriver.Chrome(chrome_driver, options=chrome_options)
driver.get("https://inv-veri.chinatax.gov.cn/")
4、奉上完整版代码
import pandas as pd
import time,os
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
# cd C:\Program Files (x86)\Google\Chrome\Application & chrome.exe --remote-debugging-port=9222 --user-data-dir="D:\selenum\AutomationProfile"
# url : https://inv-veri.chinatax.gov.cn/
# ---------------------------------------------------------------------------
# Invoice verification script.
#
# Reads invoice rows from the first file in the data-source folder, types
# each row into the form on https://inv-veri.chinatax.gov.cn/, asks the
# operator to type the captcha on the console, submits the form and clicks
# through the download dialog / pop-up window.
#
# Expected columns per row (all read as strings):
#   0: invoice code, 1: invoice number, 2: issue date, 3: amount (excl. tax)
# ---------------------------------------------------------------------------
DATA_DIR = './数据源/'
WAIT_TIMEOUT = 5      # seconds to wait for an element / frame / window
POLL_INTERVAL = 0.5   # seconds between polls while waiting


def _first_data_file(folder):
    """Return the path of the first usable entry in *folder*.

    Entries are sorted so the choice is deterministic; hidden files and
    Office lock files (names starting with ``~$``) are skipped.

    Raises:
        FileNotFoundError: if *folder* has no usable entry.
    """
    names = sorted(
        name for name in os.listdir(folder)
        if not name.startswith(('~$', '.'))
    )
    if not names:
        raise FileNotFoundError('no data file found in %r' % folder)
    return os.path.join(folder, names[0])


def _wait(driver):
    """Return a WebDriverWait configured with the script-wide timeout."""
    return WebDriverWait(driver, WAIT_TIMEOUT, POLL_INTERVAL)


def _wait_for(driver, xpath):
    """Wait until the element matching *xpath* is present and return it."""
    return _wait(driver).until(
        EC.presence_of_element_located((By.XPATH, xpath)))


def _fill(driver, xpath, text):
    """Clear the input matching *xpath* and type *text* into it.

    The brief pauses between actions are kept from the original script.
    """
    field = _wait_for(driver, xpath)
    time.sleep(0.2)
    field.clear()
    field.send_keys(text)
    time.sleep(0.1)


def _click(driver, xpath, pause=0.2):
    """Wait for the element matching *xpath*, pause briefly, then click it."""
    element = _wait_for(driver, xpath)
    time.sleep(pause)
    element.click()


def _enter_result_dialog(driver):
    """Switch into the result dialog's iframe once it is available."""
    _wait(driver).until(
        EC.frame_to_be_available_and_switch_to_it((By.ID, "dialog-body")))


def _verify_and_download(driver, row):
    """Submit one invoice *row* and click through the download flow.

    Args:
        driver: the Selenium WebDriver showing the verification page.
        row: list of strings: invoice code, invoice number, issue date and
            amount, in that order.

    Raises:
        selenium.common.exceptions.TimeoutException: if an expected element,
            the result dialog or the download window does not show up within
            WAIT_TIMEOUT seconds.
    """
    main_window = driver.current_window_handle

    # Invoice code and invoice number.
    _fill(driver, '//input[@id="fpdm"]', row[0])
    _fill(driver, '//input[@id="fphm"]', row[1])

    # Issue date: drop any time part and the dashes ("2022-12-01 ..." ->
    # "20221201"). It is typed exactly once; typing it twice would put the
    # date into the field two times.
    print(row[2])
    _fill(driver, '//input[@id="kprq"]',
          row[2].split(" ")[0].replace('-', ""))

    # Amount.
    _fill(driver, '//input[@id="kjje"]', row[3])

    # Captcha: locate the field first, then let the operator read the image
    # in the browser and type the answer on the console.
    captcha_field = _wait_for(driver, '//input[@id="yzm"]')
    time.sleep(0.2)
    captcha_field.clear()
    captcha_text = input("请输入验证码:")
    captcha_field.send_keys(captcha_text)
    time.sleep(0.1)

    # Submit the query.
    _click(driver, '(//button[contains(text(),"查")])[2]', pause=0.1)

    # The result is shown inside an iframe; wait for it instead of relying
    # on a fixed sleep, then press the download button.
    _enter_result_dialog(driver)
    handles_before = driver.window_handles
    _click(driver, '//button[@id="pdfDownNow"]', pause=0.5)
    time.sleep(0.5)
    driver.switch_to.default_content()

    # Wait for the new window so that we never act on, or close, the main
    # window by mistake.
    _wait(driver).until(EC.new_window_is_opened(handles_before))
    new_handles = [h for h in driver.window_handles
                   if h not in handles_before]
    driver.switch_to.window(new_handles[-1])
    _click(driver, '(//div[@class="button"])[3]')
    time.sleep(0.25)
    driver.close()
    time.sleep(0.8)

    # Back on the main page: close the result dialog.
    driver.switch_to.window(main_window)
    _enter_result_dialog(driver)
    _click(driver, '//button[@id="closebt"]', pause=0.25)
    time.sleep(0.2)
    driver.switch_to.default_content()
    time.sleep(1.2)


t1 = time.time()
# keep_default_na expects a bool; False has the same effect as the empty
# string used before (both are falsy).
df = pd.read_excel(_first_data_file(DATA_DIR), sheet_name=0, dtype=str,
                   keep_default_na=False)
data = df.values.tolist()
chrome_options = Options()
# Uncomment to attach to an already running Chrome started with
# --remote-debugging-port=9222:
# chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
chrome_driver = "./chromedriver.exe"
# `options=` replaces the deprecated `chrome_options=` keyword.
driver = webdriver.Chrome(chrome_driver, options=chrome_options)
driver.get("https://inv-veri.chinatax.gov.cn/")
count2 = 0
# NOTE(review): only the row at index 3 is processed, which looks like a
# debugging leftover -- iterate over `data` to handle every row. TODO confirm.
for row in data[3:4]:
    _verify_and_download(driver, row)
# NOTE(review): the driver is not quit here (unchanged from the original);
# add driver.quit() once it is confirmed that downloads have finished.























