词频统计小程序-WordCount.exe
最近顶哥为了完成学历提升学业中的小作业,做了一个词频统计的.exe小程序。因为当时做的时候网上的相关资料比较少,因此顶哥决定把自己拙劣的作品发出来,给需要的人提供一种思路,希望各位看官不要dis才好。最后附上源码链接,感兴趣的朋友可以继续优化哦。
二. 先看效果
双击运行,下拉框选择源文件来源,支持本地和网络资源,如图:
本地源文件示例
网络源文件示例
<dependencies>
    <dependency>
        <groupId>com.janeluo</groupId>
        <artifactId>ikanalyzer</artifactId>
        <version>2012_u6</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.jsoup</groupId>
        <artifactId>jsoup</artifactId>
        <version>1.11.3</version>
    </dependency>
</dependencies>
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-surefire-plugin</artifactId>
            <version>2.18.1</version>
            <configuration>
                <skipTests>true</skipTests>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>2.4.1</version>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
                <archive>
                    <manifest>
                        <addClasspath>true</addClasspath>
                        <mainClass>cn.dintalk.service.WordCount</mainClass>
                    </manifest>
                </archive>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
/** * @author Mr.song * @date 2019/10/13 9:26 */ public class WebUtils { /** * 根据url和参数发送get请求 * * @param url * @param param * @return 返回网页内容 */ public static String sendGet(String url, String param) { String result = ""; if (param != null) { url = url + "?" + param; } try { URL realUrl = new URL(url); // 打开和URL之间的连接 HttpURLConnection conn = getHttpURLConnection(realUrl); result = getResponse(conn); } catch (Exception e) { e.printStackTrace(); } return result; } //根据url 获取连接 private static HttpURLConnection getHttpURLConnection(URL realUrl) { StringBuilder sb = new StringBuilder(); sb.append("Mozilla/5.0 (Windows NT 10.0; Win64; x64)"); sb.append(" AppleWrbKit/537.36(KHTML, like Gecko)"); sb.append(" Chrome/72.0.3626.119 Safari/537.36"); HttpURLConnection conn = null; try { // 打开和URL之间的连接 conn = (HttpURLConnection) realUrl.openConnection(); // 设置通用的请求属性 conn.setRequestProperty("accept", "*/*"); conn.setRequestProperty("connection", "Keep-Alive"); conn.setRequestProperty("user-agent", sb.toString()); } catch (IOException e) { e.printStackTrace(); } return conn; } // 根据url连接获取响应 private static String getResponse(HttpURLConnection conn) { // 读取URL的响应 String result = ""; try (InputStream is = conn.getInputStream(); InputStreamReader isr = new InputStreamReader(is, "utf-8"); BufferedReader in = new BufferedReader(isr)) { String line; while ((line = in.readLine()) != null) { result += "\n" + line; } } catch (Exception e) { System.out.println("Err:getResponse()"); e.printStackTrace(); } finally { conn.disconnect(); } // System.out.println("getResponse():" + result.length()); return result; } /** * 解析网页为文本 * * @param html * @return */ public static String parseHtmlToText(String html) { Document document = Jsoup.parse(html); return document.text(); } }
/** * @author Mr.song * @date 2019/10/10 21:12 */ public class IKSUtils { /** * 对文本进行分词 * @param text * @return * @throws Exception */ public static ListgetStringList(String text) throws Exception{ //独立Lucene实现 StringReader re = new StringReader(text); IKSegmenter ik = new IKSegmenter(re, true); Lexeme lex; List s = new ArrayList<>(); while ((lex = ik.next()) != null) { s.add(lex.getLexemeText()); } return s; } /** * 统计词频 * @param wordList * @return */ public static Map wordCount(List wordList){ if (wordList == null) return null; Map result = new HashMap<>(); for (String s : wordList) { Integer count = result.get(s); if (count == null){ result.put(s,1); }else { result.put(s,++count); } } //按照次数排序 result = result .entrySet() .stream() .sorted(Collections.reverseOrder(Map.Entry.comparingByValue())) .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2, LinkedHashMap::new)); return result; } }
https://github.com/MrSonghui/wordCount
将一个jar包打包成.exe文件,这里给一个参考:
https://www.cnblogs.com/xiaoMzjm/p/3879766.html
喜欢的朋友可以关注我的公众号,需要广告托管的朋友可以加QQ哦!