需求分析
- 关键词

 - 统计关键词出现的频率
 
IK分词
进行分词需要引入IK分词器,使用它时需要引入相关的依赖。它能够将搜索的关键字按照日常的使用习惯进行拆分。比如将苹果iphone 手机,拆分为苹果,iphone, 手机。
<dependency>
    <groupId>org.apache.doris</groupId>
    <artifactId>flink-doris-connector-1.17</artifactId>
</dependency>
<dependency>
    <groupId>com.janeluo</groupId>
    <artifactId>ikanalyzer</artifactId>
</dependency>
 
测试代码如下:
public class IkUtil {
    public static void main(String[] args) throws IOException {
        String s = "Apple 苹果15 5G手机";
        StringReader stringReader = new StringReader(s);
        IKSegmenter ikSegmenter = new IKSegmenter(stringReader, true);//第二个参数表示是否再对拆分后的单词再进行拆分,true时表示不在继续拆分
        Lexeme next = ikSegmenter.next();
        while (next!= null) {
            System.out.println(next.getLexemeText());
            next = ikSegmenter.next();
        }
    }
}
 
整体流程
- 创建自定义分词工具类IKUtil,IK是一个分词工具依赖
 - 创建自定义函数类
 - 注册函数
 - 消费kafka DWD页面主题数据并设置水位线
 - 从主流中过滤搜索行为 
  
- page[‘item’] is not null
 - item_type : “keyword”
 - last_page_id: “search”
 
 - 使用分词函数对keyword进行拆分
 - 对keyword进行分组开窗聚合
 - 写出到doris 
  
- 创建doris sink
 - flink需要打开检查点才能将数据写出到doris
 
 

具体实现
import com.atguigu.gmall.realtime.common.base.BaseSQLApp;
import com.atguigu.gmall.realtime.common.constant.Constant;
import com.atguigu.gmall.realtime.common.util.SQLUtil;
import com.atguigu.gmall.realtime.dws.function.KwSplit;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
/**
 * title:
 *
 * @Author 浪拍岸
 * @Create 28/12/2023 上午11:06
 * @Version 1.0
 */
public class DwsTrafficSourceKeywordPageViewWindow extends BaseSQLApp {
    public static void main(String[] args) {
        new DwsTrafficSourceKeywordPageViewWindow().start(10021,4,"dws_traffic_source_keyword_page_view_window");
    }
    @Override
    public void handle(StreamExecutionEnvironment env, TableEnvironment tableEnv, String groupId) {
        //1. 读取主流dwd页面主题数据
        tableEnv.executeSql("create table page_info(\n" +
                "    `common` map<string,string>,\n" +
                "    `page` map<string,string>,\n" +
                "    `ts` bigint,\n" +
                "    `row_time` as to_timestamp_ltz(ts,3),\n" +
                "     WATERMARK FOR row_time AS row_time - INTERVAL '5' SECOND\n" +
                ")" + SQLUtil.getKafkaSourceSQL(Constant.TOPIC_DWD_TRAFFIC_PAGE, groupId));
        //测试是否获取到数据
        //tableEnv.executeSql("select * from page_info").print();
        //2. 筛选出关键字keywords
        Table keywrodTable = tableEnv.sqlQuery("select\n" +
                "    page['item'] keywords,\n" +
                "    `row_time`,\n" +
                "    ts\n" +
                " from page_info\n" +
                " where page['last_page_id'] = 'search'\n" +
                " and page['item_type'] = 'keyword'\n" +
                " and page['item'] is not null");
        tableEnv.createTemporaryView("keywords_table", keywrodTable);
        // 测试是否获取到数据
        //tableEnv.executeSql("select * from keywords_table").print();
        //3. 自定义分词函数并注册
        tableEnv.createTemporarySystemFunction("kwSplit", KwSplit.class );
        //4. 调用分词函数对keywords进行拆分
        Table splitKwTable = tableEnv.sqlQuery("select keywords, keyword, `row_time`" +
                " from keywords_table" +
                " left join lateral Table(kwSplit(keywords)) on true");
        tableEnv.createTemporaryView("split_kw_table", splitKwTable);
        //tableEnv.executeSql("select * from split_kw_table").print();
        //5. 对keyword进行分组开窗聚合
        Table windowAggTable = tableEnv.sqlQuery("select\n" +
                "    keyword,\n" +
                "    cast(tumble_start(row_time,interval '10' second ) as string) wStart,\n" +
                "    cast(tumble_end(row_time,interval '10' second ) as string) wEnd,\n" +
                "    cast(current_date as string)  cur_date,\n" +
                "    count(*) keyword_count\n" +
                "from split_kw_table\n" +
                "group by tumble(row_time, interval '10' second), keyword");
        //tableEnv.createTemporaryView("result_table",table);
        //tableEnv.executeSql("select keyword,keyword_count+1 from result_table").print();
        //6. 写出到doris
        tableEnv.executeSql("create table doris_sink\n" +
                "(\n" +
                "    keyword                STRING,\n" +
                "    wStart                 STRING,\n" +
                "    wEnd                   STRING,\n" +
                "    cur_date               STRING,\n" +
                "    keyword_count          BIGINT\n" +
                ")" + SQLUtil.getDorisSinkSQL(Constant.DWS_TRAFFIC_SOURCE_KEYWORD_PAGE_VIEW_WINDOW));
        windowAggTable.insertInto("doris_sink").execute();
    }
}
                


















