FlinkSQL实践记录4 -- 实时更新的维表如何join
1. 背景
对于实时更新的维表,以什么组件来处理作为FlinkSQL的source维表?HBase?Kafka?或mysql?哪一种方案能得到正确结果?
且需要考虑到事实表和维表关联的时候,是否需要和维表的历史版本关联?还是只关联维表的最新版本?
下文以只关联维表的最新版本为目标进行测试。
2. 实践过程
2.1 采用upsert-kafka作为维表
(1) kafka生产者代码
// 创建消息
DateTimeFormatter dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss.nnnnnnnnn");
for (int i = 2; i < 8; i++) {
JSONObject json1 = new JSONObject();
json1.put("key", i+"");
//json.put("update_time", dtf.format(LocalDateTime.now()));
JSONObject json = new JSONObject();
json.put("id", i+"");
json.put("name", "name444"+i);
ProducerRecord record = new ProducerRecord(
"flinksqldim",
json1.toJSONString(),
json.toJSONString()
);
}
(2) FlinkSQL主体代码
// 创建执行环境
//EnvironmentSettings settings = EnvironmentSettings.inStreamingMode();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
EnvironmentSettings settings = EnvironmentSettings.newInstance()
.useBlinkPlanner()
.inStreamingMode()
.build();
TableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
// 把kafka中的topic映射成一个输入临时表
tableEnv.executeSql(
"CREATE TABLE sensor_source(" +
" id STRING, " +
" name STRING, " +
" o_time TIMESTAMP(3), " +
" WATERMARK FOR o_time AS o_time " +
" ) WITH (" +
" 'connector' = 'kafka'," +
" 'topic' = 'flinksqldemo'," +
" 'properties.bootstrap.servers' = 'ip:port'," +
" 'properties.group.id' = 'flinksqlCount'," +
" 'scan.startup.mode' = 'earliest-offset'," +
" 'format' = 'json')"
);
// 把kafka中数据 映射成输入维表 - 实时变更的维表
tableEnv.executeSql(
"CREATE TABLE dim_source (" +
" id STRING," +
" name STRING," +
" update_time TIMESTAMP(3) METADATA FROM 'timestamp' VIRTUAL, " +
" WATERMARK FOR update_time AS update_time, " +
" PRIMARY KEY (id) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'upsert-kafka'," +
" 'topic' = 'flinksqldim'," +
" 'properties.bootstrap.servers' = 'ip:port'," +
" 'properties.group.id' = 'flinksqlDim'," +
" 'key.format' = 'json'," +
" 'value.format' = 'json')"
);
// 把Mysql中的表映射为一个输出临时表
String mysql_sql = "CREATE TABLE mysql_sink (" +
" name STRING," +
" cnt BIGINT," +
" PRIMARY KEY (name) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'jdbc'," +
" 'url' = 'jdbc:mysql://ip:port/kafka?serverTimezone=UTC'," +
" 'table-name' = 'count_info'," +
" 'username' = 'xxx'," +
" 'password' = 'xxx'" +
")";
tableEnv.executeSql(mysql_sql);
// 插入数据
TableResult tableResult = tableEnv.executeSql(
"INSERT INTO mysql_sink " +
"SELECT b.name, count(*) as cnt " +
"FROM sensor_source as a " +
"INNER JOIN dim_source as b " +
"on a.id = b.id " +
"where a.id > 3 " +
"group by b.name "
// "order by name "
);
System.out.println(tableResult.getJobClient().get().getJobStatus());
3. 试错
3.1 使用Regular Joins 常规join
kafka生产者代码
for (int i = 1; i < 10; i++) {
//json.put("update_time", dtf.format(LocalDateTime.now()));
JSONObject json = new JSONObject();
json.put("id", i+"");
json.put("name", "name555"+i);
ProducerRecord record = new ProducerRecord(
"flinksqldim2",
i,
json.toJSONString()
);
// 发送消息
Future future = producer.send(record);
FlinkSQL处理代码
// 创建执行环境
//EnvironmentSettings settings = EnvironmentSettings.inStreamingMode();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
EnvironmentSettings settings = EnvironmentSettings.newInstance()
.useBlinkPlanner()
.inStreamingMode()
.build();
TableEnvironment tableEnv = StreamTableEnvironment.create(env, settings);
// 把kafka中的topic映射成一个输入临时表
tableEnv.executeSql(
"CREATE TABLE sensor_source(" +
"id STRING, " +
"name STRING, " +
"o_time TIMESTAMP(3), " +
" WATERMARK FOR o_time AS o_time " +
") WITH (" +
" 'connector' = 'kafka'," +
" 'topic' = 'flinksqldemo'," +
" 'properties.bootstrap.servers' = 'ip:port'," +
" 'properties.group.id' = 'flinksqlCount'," +
" 'scan.startup.mode' = 'earliest-offset'," +
" 'format' = 'json')"
);
// 把kafka中数据 映射成输入维表 - 实时变更的维表, 非compacted topic
tableEnv.executeSql(
"CREATE TABLE dim_source ( " +
" id STRING, " +
" name STRING, " +
" update_time TIMESTAMP(3) METADATA FROM 'timestamp' VIRTUAL, " +
" WATERMARK FOR update_time AS update_time " +
") WITH (" +
" 'connector' = 'kafka'," +
" 'topic' = 'flinksqldim2'," +
" 'properties.bootstrap.servers' = 'ip:port'," +
" 'properties.group.id' = 'flinksqlDim'," +
" 'scan.startup.mode' = 'earliest-offset'," +
" 'format' = 'json')"
);
// 把Mysql中的表映射为一个输出临时表
String mysql_sql = "CREATE TABLE mysql_sink (" +
" name STRING," +
" cnt BIGINT," +
" PRIMARY KEY (name) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'jdbc'," +
" 'url' = 'jdbc:mysql://ip:port/kafka?serverTimezone=UTC'," +
" 'table-name' = 'count_info'," +
" 'username' = 'xxx'," +
" 'password' = 'xxx'" +
")";
tableEnv.executeSql(mysql_sql);
// 插入数据
TableResult tableResult = tableEnv.executeSql(
"INSERT INTO mysql_sink " +
"SELECT b.name, count(*) as cnt " +
"FROM sensor_source a " +
"JOIN dim_source b " +
"on a.id = b.id " +
"where a.id > 3 " +
"group by b.name "
);
System.out.println(tableResult.getJobClient().get().getJobStatus());
维表流更新了几次数据后,结果表count_info中数据错乱
3.2 mysql表作为维表
tableEnv.executeSql(
"CREATE TEMPORARY TABLE mysql_source (" +
" id STRING," +
" name STRING," +
" update_time TIMESTAMP(3)," +
" PRIMARY KEY (id) NOT ENFORCED" +
") WITH (" +
" 'connector' = 'jdbc'," +
" 'url' = 'jdbc:mysql://ip:port/kafka?serverTimezone=UTC'," +
" 'table-name' = 'user_info'," +
" 'username' = 'xxx'," +
" 'password' = 'xxx'" +
")"
);
报错如下:
Event-Time Temporal Table Join requires both primary key and row time attribute in versioned table, but no row time attribute can be found.
可见,mysql直接作为维表不能做temporal join , 那如果加上CDC呢?是不是就可以了?更新中。。。