UDFToTopN案例
需求:
统计最热门的课程Top10
http://bigdata.com/course/458655.html => 458655
http://bigdata.com/course/458655/2.html?a=b&c=d => 458655_2
解析后再统计课程Top10
1.使用 MockClassData生成数据
package com.cj.bigdata.hive.hiveWork;
import java.io.*;
import java.util.Random;
public class MockClassData {
public static void main(String[] args) throws IOException {
//课程号数组
String words[] = {"123","4354","43541","43542","43543","43544","43545","43546","43547","43548"};
Random random = new Random();
//文件输出流对象
BufferedWriter bufferedWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(new File("data/input/classData.txt"))));
//for循环随机生成数据
for (int i = 0; i < 30000; i++) {
bufferedWriter.write("http://bigdata.com/course/" + words[random.nextInt(words.length)]);
if(random.nextInt(4) != 0){
bufferedWriter.write("/" + random.nextInt(3) + ".html?a=b&c=d");
bufferedWriter.newLine();
}else {
bufferedWriter.write(".html");
bufferedWriter.newLine();
}
}
bufferedWriter.flush();
bufferedWriter.close();
}
}
2.创建表
CREATE TABLE `default.class`(
`class_str` string)
3.将数据upload服务器并import到table class
load data local inpath '/home/jackson/data/classData.txt' overwrite into table class;
#检查数据
select * from class limit 10;
4.编写UDFParseClassCode
package com.cj.bigdata.hive.hiveWork;
import org.apache.hadoop.hive.ql.exec.UDF;
public class UDFParseClassCode extends UDF {
/***
*
* http://bigdata.com/course/4354/2.html?a=b&c=d
* http://bigdata.com/course/43548.html
*/
public String evaluate(String str){
StringBuffer buffer = new StringBuffer();
String[] splits01 = str.split(".html");
//http://bigdata.com/course/4354/2或http://bigdata.com/course/43548
String splits02 = splits01[0];
String[] splits03 = splits02.split("/");
//用split后的的最后一块的长度来判断是//http://bigdata.com/course/4354/2还是http://bigdata.com/course/43548
if(splits03[splits03.length - 1].length() == 1){
buffer.append(splits03[splits03.length - 2]);
buffer.append("_");
buffer.append(splits03[splits03.length - 1]);
}else {
buffer.append(splits03[splits03.length - 1]);
}
return buffer.toString();
}
}
5.打包上传生成UDF函数执行sql
5.1上传到$HIVE_HOME/auxlib
CREATE TEMPORARY FUNCTION parseClass AS 'com.cj.bigdata.hive.hiveWork.UDFParseClassCode';
5.2sql完成需求
select
parseClass(class_str) class_code,
count(*) topN
from class
group by parseClass(class_str)
limit 10
;
结果:
class_code topN
123 801
123_0 759
123_1 773
123_2 749
43547 18
43541 721
43541_0 728
43541_1 759
43541_2 754
43542 728