Qt5 网页标题、关键词提取工具Findyou
一、程序运行
 运行界面
 
 辅助功能,可用于将扫描器的扫描结果转换为url
 
二、所涉及的重要知识点
 1、Qt爬取https的网页
 来自宇龍_
 https://blog.csdn.net/qq_45809384/article/details/122049295?spm=1001.2014.3001.5506
 
 打包完成后,把这两个dll补充了就可以
 
 配合下面这段代码使用(即 robots.cpp 的 Robots() 中设置 QSslConfiguration 的那一段)
 
2、对301、302的重定向进行跟踪,加入这一行代码就可以:request.setAttribute(QNetworkRequest::FollowRedirectsAttribute,true);(见 robots.cpp)
 
 项目结构
 
源代码
 getTitleFromUrl.pro
#-------------------------------------------------
#
# Project created by QtCreator 2022-11-23T22:53:34
#
#-------------------------------------------------
# Qt modules used by the application: core/gui for the base classes and
# network for the QNetworkAccessManager-based page fetching in robots.cpp.
QT       += core gui
QT += network
# Qt 5 and later ship the widget classes in a separate "widgets" module.
greaterThan(QT_MAJOR_VERSION, 4): QT += widgets
# Name of the produced executable and the kind of project (an application).
TARGET = getTitleFromUrl
TEMPLATE = app
# The following define makes your compiler emit warnings if you use
# any feature of Qt which as been marked as deprecated (the exact warnings
# depend on your compiler). Please consult the documentation of the
# deprecated API in order to know how to port your code away from it.
DEFINES += QT_DEPRECATED_WARNINGS
# You can also make your code fail to compile if you use deprecated APIs.
# In order to do so, uncomment the following line.
# You can also select to disable deprecated APIs only up to a certain version of Qt.
#DEFINES += QT_DISABLE_DEPRECATED_BEFORE=0x060000    # disables all the APIs deprecated before Qt 6.0.0
# Translation units compiled into the application.
SOURCES += \
        main.cpp \
        mainwindow.cpp \
    robots.cpp \
    form.cpp
# Headers of the project (the two window classes and the crawler helpers).
HEADERS += \
        mainwindow.h \
    robots.h \
    form.h
# Qt Designer forms; the build generates ui_mainwindow.h and ui_form.h from them.
FORMS += \
        mainwindow.ui \
    form.ui
# No extra distribution files are listed.
DISTFILES +=
# Resource collection; the window icon "://bgjzicon.png" is loaded from the resource system.
RESOURCES += \
    myicon.qrc
 
form.h
#ifndef FORM_H
#define FORM_H
#include <QWidget>
#include<QIcon>
#include<QDebug>
#include<QStringList>
namespace Ui {
class Form;
}
class Form : public QWidget
{
    Q_OBJECT
public:
    explicit Form(QWidget *parent = 0);
    ~Form();
private slots:
    void on_pushButton_clicked();
private:
    Ui::Form *ui;
};
#endif // FORM_H
 
mainwindow.h
#ifndef MAINWINDOW_H
#define MAINWINDOW_H
#include <QMainWindow>
#include <QFile>
#include<QIcon>
#include"robots.h"
#include"form.h"
namespace Ui {
class MainWindow;
}
class MainWindow : public QMainWindow
{
    Q_OBJECT
public:
    explicit MainWindow(QWidget *parent = 0);
    ~MainWindow();
private slots:
    void on_pushButton_start_clicked();
    void on_pushButton_clicked();
private:
    Ui::MainWindow *ui;
};
#endif // MAINWINDOW_H
 
robots.h
#ifndef ROBOTS_H
#define ROBOTS_H
// NOTE: the include guard must enclose the whole header. Previously the
// #endif directly followed the #define, so the guard protected nothing.
#include<QCoreApplication>
#include<QRegularExpression>
#include<QRegularExpressionMatch>
#include<QRegularExpressionMatchIterator>
#include<QString>
#include<QDebug>
#include <QCoreApplication>
#include<QtCore>
#include<QNetworkAccessManager>
#include<QUrl>
#include<QNetworkRequest>
#include<QNetworkReply>
#include<QObject>
#include<QTextCodec>
// Crawls the page at `url` and returns the response body as text
// (implemented in robots.cpp).
QString Robots(QString url);
// Applies the regular expression `re` to `HTML` and returns
// fenzu + "标签值:" followed by the concatenated captures of the named
// group `fenzu` from every match (implemented in robots.cpp).
QString RegularExpression(QString HTML,QString re,QString fenzu);
#endif // ROBOTS_H
 
form.cpp
#include "form.h"
#include "ui_form.h"
// Builds the helper window: creates the generated UI, installs it on this
// widget and sets the window icon from the Qt resource system.
Form::Form(QWidget *parent)
    : QWidget(parent)
    , ui(new Ui::Form)
{
    ui->setupUi(this);

    const QIcon windowIcon("://bgjzicon.png");
    setWindowIcon(windowIcon);
}
// Releases the generated UI object that the constructor allocated with new.
Form::~Form()
{
    delete ui;
}
void Form::on_pushButton_clicked()
{
    qDebug()<<ui->textEdit_xieyi->toPlainText();
    QStringList xieyiList=ui->textEdit_xieyi->toPlainText().split("\n");
    xieyiList.removeFirst();
    xieyiList.removeLast();
    QStringList ipList=ui->textEdit_ip->toPlainText().split("\n");
    ipList.removeFirst();
    ipList.removeLast();
    QStringList portList=ui->textEdit_port->toPlainText().split("\n");
    portList.removeFirst();
    portList.removeLast();
    QString url="";
    for(int i=0;i<xieyiList.size();i++)
    {
        url=url+xieyiList[i]+"://"+ipList[i]+":"+portList[i]+"\n";
    }
    ui->textEdit_url->setText(url);
}
 
main.cpp
#include "mainwindow.h"
#include <QApplication>
#include<QTextCodec>
#include "robots.h"
int main(int argc, char *argv[])
{
    QApplication a(argc, argv);
    MainWindow w;
    w.show();
    return a.exec();
}
 
mainwindow.cpp
#include "mainwindow.h"
#include "ui_mainwindow.h"
// Builds the main window: creates the generated UI, installs it on this
// window and sets the window icon from the Qt resource system.
MainWindow::MainWindow(QWidget *parent)
    : QMainWindow(parent)
    , ui(new Ui::MainWindow)
{
    ui->setupUi(this);

    const QIcon windowIcon("://bgjzicon.png");
    setWindowIcon(windowIcon);
}
// Releases the generated UI object that the constructor allocated with new.
MainWindow::~MainWindow()
{
    delete ui;
}
void MainWindow::on_pushButton_start_clicked()
{
    QString alltitleresult="";
    QString allkeywordresult="以下页面内含有关键词:";
    ui->textEdit_title->setText(alltitleresult);
    ui->textEdit_h1->setText(allkeywordresult);
    int count=0;
    //这一段是用来匹配出每一个url,可以增加一些对输入格式的兼容性...........................................................
    QRegularExpression Re("(?<url>http[s]{0,1}.*?://.*?)fengefu");
    QString urls=ui->textEdit_url->toPlainText();
    qDebug()<<"原始的:"<<urls<<endl;
    //排除url重定向的链接打乱顺序 如 http://xx.xxx.xx.x/login.php?redirect=http://xxx.xx.xx/
    urls.replace("=http","1");
    //去除\r\n
    urls.remove("\n");
    urls.remove("\r");
     urls.remove("\t");
    //这样可以使用fengefu有效分割出每个url,适应不同的输入格式
    urls.replace("http://","fengefuhttp://");
    urls.replace("https://","fengefuhttps://");
    //在末尾加上分隔符这样可以兼容最后一个url,使得最后一个url得到匹配
    urls=urls+"fengefu";
    qDebug()<<urls;
    //....................................................................................................................
       if(ui->lineEdit_keyword->text()!="")//输入了关键字
     {
       qDebug()<<"输入了关键字"<<endl;
       QRegularExpressionMatchIterator Matchs=Re.globalMatch(urls);
       QRegularExpressionMatch match=Matchs.next();
       QString oneUrl=match.captured("url");//提取每一个url
       qDebug()<<"提取到"<<oneUrl<<endl;
       //单独爬取第一个.......................................................................
       QString HTML=Robots(oneUrl);
       QString title_re="(<title.*?>(?<title>.*?)</title>)";
       QString titleresult=RegularExpression(HTML,title_re,"title");
       QString keyword=ui->lineEdit_keyword->text();
       QString keyword_re="(?<keyword>"+keyword+")";
       QString keywordresult=RegularExpression(HTML,keyword_re,"keyword");
       //qDebug()<<"关键词正则"<<keyword_re<<endl;
       if(keywordresult!="keyword标签值:")//匹配到关键词了
       {
           allkeywordresult=oneUrl;
       }
       alltitleresult=alltitleresult+titleresult+"\n";
       ui->textEdit_title->setText(alltitleresult);
       ui->textEdit_h1->setText(allkeywordresult);
       //qDebug()<<"已检查数:"<<++count;
       ui->label_jindu->setText(QString::number(++count));
     //......................................................................................
       while(Matchs.hasNext()==true)
       {
           match=Matchs.next();
           oneUrl=match.captured("url");
           qDebug()<<"提取到"<<oneUrl<<endl;
           QString HTML=Robots(oneUrl);
           QString title_re="(<title.*?>(?<title>.*?)</title>)";
           QString titleresult=RegularExpression(HTML,title_re,"title");
           QString keyword_re="(?<keyword>"+keyword+")";
           QString keywordresult=RegularExpression(HTML,keyword_re,"keyword");
           if(keywordresult!="keyword标签值:")//匹配到关键词了
           {
               allkeywordresult=allkeywordresult+"\n"+oneUrl;
           }
           alltitleresult=alltitleresult+titleresult+"\n";
           ui->textEdit_title->setText(alltitleresult);
           ui->textEdit_h1->setText(allkeywordresult);
           //滚动条置底,方便观察实时结果
            ui->textEdit_title->moveCursor(QTextCursor::End);
            ui->textEdit_h1->moveCursor(QTextCursor::End);
          // qDebug()<<"已检查数:"<<++count;
           ui->label_jindu->setText(QString::number(++count));
       }
     }
       else//未输入关键字,仅匹配title标签
     {
           qDebug()<<"未输入关键字"<<endl;
           QRegularExpressionMatchIterator Matchs=Re.globalMatch(urls);
           QRegularExpressionMatch match=Matchs.next();
           QString oneUrl=match.captured("url");//提取每一个url
           qDebug()<<"提取到1"<<oneUrl<<endl;
           //单独爬取第一个.......................................................................
           QString HTML=Robots(oneUrl);
           QString title_re="(<title.*?>(?<title>.*?)</title>)";
           qDebug()<<"爬到"<<HTML<<endl;
           QString titleresult=RegularExpression(HTML,title_re,"title");
           alltitleresult=alltitleresult+titleresult+"\n";
           ui->textEdit_title->setText(alltitleresult);
           ui->label_jindu->setText(QString::number(++count));
         //......................................................................................
           while(Matchs.hasNext()==true)
           {
               match=Matchs.next();
               oneUrl=match.captured("url");
               qDebug()<<"提取到2"<<oneUrl<<endl;
               QString HTML=Robots(oneUrl);
               QString title_re="(<title.*?>(?<title>.*?)</title>)";
              qDebug()<<"爬到2"<<HTML<<endl;
               QString titleresult=RegularExpression(HTML,title_re,"title");
               qDebug()<<"结果:"<<titleresult<<endl;
               alltitleresult=alltitleresult+titleresult+"\n";
               ui->textEdit_title->setText(alltitleresult);
               //滚动条置底,方便观察实时结果
                ui->textEdit_title->moveCursor(QTextCursor::End);
              // qDebug()<<"已检查数:"<<++count;
               ui->label_jindu->setText(QString::number(++count));
           }
     }
}
void MainWindow::on_pushButton_clicked()
{
    Form *pinjie=new Form;
    pinjie->show();
}
 
robots.cpp
#include "robots.h"
// Crawls the page at `url` and returns its body as text.
//
// The request is sent with browser-like headers, follows 301/302 redirects and
// skips TLS certificate verification. The call blocks in a local event loop
// until the reply finishes or the timeout elapses; on timeout whatever has been
// received so far is returned (possibly an empty string).
//
// Fixes over the previous version:
//  * the TLS protocol is no longer pinned to QSsl::TlsV1SslV3, which made
//    handshakes fail against servers that only accept newer TLS versions;
//  * the body is decoded with the charset announced by the page (BOM or
//    <meta charset>), falling back to UTF-8, instead of always being
//    interpreted as UTF-8.
QString Robots(QString url)
{
    // Give up waiting after 10 seconds; otherwise an unreachable URL would
    // block the crawl and the remaining URLs would never be visited.
    const int timeoutMs=10000;

    QUrl URL=url;
    QNetworkAccessManager manager;
    QEventLoop Loop;
    QNetworkRequest request=QNetworkRequest(URL);
    // Set request headers, mainly User-Agent; some sites (e.g. Baidu) cannot be
    // crawled without it.
    request.setRawHeader("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:107.0) Gecko/20100101 Firefox/107.0");
    request.setRawHeader("Accept","*/*");
    request.setRawHeader("Accept-Language","zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2");
    // Follow 301/302 responses automatically using their Location header;
    // per the author's experiments this works for relative and absolute
    // locations and across several consecutive redirects.
    request.setAttribute(QNetworkRequest::FollowRedirectsAttribute,true);
    // SSL setup so that https requests work: certificate verification is
    // disabled and any protocol version supported by the TLS backend is allowed.
    QSslConfiguration config = request.sslConfiguration();
    config.setPeerVerifyMode(QSslSocket::VerifyNone);
    config.setProtocol(QSsl::AnyProtocol);
    request.setSslConfiguration(config);

    // The reply is a child of `manager` and is destroyed together with it when
    // this function returns.
    QNetworkReply *reply=manager.get(request);
    QObject::connect(reply,&QNetworkReply::finished,&Loop,&QEventLoop::quit);
    QTimer::singleShot(timeoutMs, &Loop, &QEventLoop::quit);
    Loop.exec();

    const QByteArray body=reply->readAll();
    QTextCodec *codec=QTextCodec::codecForHtml(body,QTextCodec::codecForName("UTF-8"));
    return codec ? codec->toUnicode(body) : QString::fromUtf8(body);
}
// Extracts text from a crawled page with a regular expression.
//
// @param HTML  Text to search.
// @param re    Pattern containing a named capture group called `fenzu`.
// @param fenzu Name of the capture group to collect.
// @return fenzu + "标签值:" followed by the concatenated captures of the group
//         from every match; just the prefix when nothing matches.
//
// The iterator is only advanced after hasNext(); the previous version called
// next() unconditionally, which is undefined when there is no match.
QString RegularExpression(QString  HTML,QString re,QString fenzu)
{
    QString TextAfterRe=fenzu+"标签值:";
    const QRegularExpression Re(re);
    QRegularExpressionMatchIterator Matchs=Re.globalMatch(HTML);
    while(Matchs.hasNext())
    {
        TextAfterRe+=Matchs.next().captured(fenzu);
    }
    return TextAfterRe;
}
                


















