利用文本挖掘技术来找出网络中的“小鲜词”
发布时间:2021-01-20 19:27:41 所属栏目:大数据 来源:网络整理
导读:开始之前,先看一下从人人网中发现的90后用户爱用的词 是不是很好玩,哈哈。写这篇文章就是让你简单地、自动地从文本中找出新的词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜) 项目结构 当然,text.dat和common.dic……
用来做文本处理,如判断是否为空、匹配字符等 package grid.common; public class TextUtils { public static boolean isCnLetter(char c) { return c >= 0x4E00 && c <= 0x9FCB; } public static boolean isNumeric(char c) { return c >= '0' && c <= '9'; } public static boolean isEnLetter(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } public static boolean match(String src,int off,String dest) { int len = dest.length(); int srcLen = src.length(); for (int i = 0; i < len; i++) { if (srcLen <= off + i) { return false; } if (dest.charAt(i) != src.charAt(off + i)) { return false; } } return true; } public static boolean isBlank(String str) { return null == str || str.isEmpty() || str.trim().isEmpty(); } } Tree.java语法树 package grid.common; public class Tree<T> extends Node<T> { public Tree(T value) { super(value); } } dic里边包含CnDictionary类 CnDictionary.java词典处理 package grid.text.dic; import grid.common.CountMap; import grid.common.TextDatReader; import grid.common.TextUtils; import java.io.IOException; import java.util.HashSet; import java.util.Set; public class CnDictionary { private final String COMMON_WORD_DIC_PATH = "common.dic"; /** * This text data is for character statistic. Change to your own if you * like. 
*/ private final String COMMON_LETTER_RESOURCE_PATH = "text.dat"; private Set<String> dictionary = new HashSet<String>(); private CountMap<Character> letterCountMap = new CountMap<Character>(); private int totalLetterCount; private static CnDictionary instance; //单例模式 public static CnDictionary Instance() { if (null == instance) { try { instance = new CnDictionary(); } catch (IOException e) { e.printStackTrace(); } } return instance; } private CnDictionary() throws IOException { initWordDic(); initLetterCountMap(); } private void initLetterCountMap() throws IOException { String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH);//读取语料数据 text.dat final int len = letterResource.length(); char c; for (int i = 0; i < len; i++) { c = letterResource.charAt(i); if (TextUtils.isCnLetter(c)) { letterCountMap.increase(c); } } totalLetterCount = letterCountMap.count(); } private void initWordDic() throws IOException { String bytes = TextDatReader.read(COMMON_WORD_DIC_PATH);//读取词典commondic final int len = bytes.length(); String s = ""; char c; for (int i = 0; i < len; i++) { c = bytes.charAt(i); if ('n' == c || 'r' == c || 0 == c) { if (!TextUtils.isBlank(s)) { dictionary.add(s.trim()); } s = ""; } else { s += c; } if (0 == c) { break; } } } public boolean contains(String word) { return dictionary.contains(word); } public double rate(char c) { return (double) letterCountMap.get(c) / totalLetterCount; } public int size() { return dictionary.size(); } } evolutionEntropyJudger.java计算熵值 package grid.text.evolution; import grid.common.CountMap; import grid.common.TextUtils; import grid.text.index.Pos; import grid.text.index.TextIndexer; public class EntropyJudger { private TextIndexer indexer; /** * A word least appeared count */ private static int LEAST_COUNT_THRESHOLD = 5; //阈值 /** * Threshold for solid rate calculated by word appeared count and every * single letter. * * The smaller this values is,more new words you will get,but with less * accuracy. 
The greater this value is,less new words you will get,but * with high accuracy. */ private static double SOLID_RATE_THRESHOLD = 0.018; /** * Threshold for entropy value calculated by candidate word prefix character * count and suffix character count * * The smaller this values is,but * with high accuracy. */ private static double ENTROPY_THRESHOL = 1.92; public EntropyJudger(TextIndexer indexer) { this.indexer = indexer; } public boolean judge(String candidate) { double solidRate = getSolidRate(candidate); if (solidRate < SOLID_RATE_THRESHOLD) { return false; } double entropy = getEntropy(candidate); if (entropy < ENTROPY_THRESHOL) { return false; } return true; } private double getEntropy(String candidate) { Pos pos = new Pos(candidate); CountMap<Character> frontCountMap = new CountMap<Character>(); CountMap<Character> backCountMap = new CountMap<Character>(); final int candidateLen = candidate.length(); int off = 0; char c; double rate,frontEntropy = 0,backEntropy = 0; while (indexer.find(pos).isFound()) { off = pos.getPos(); c = indexer.charAt(off - 1); if (TextUtils.isCnLetter(c)) { frontCountMap.increase(c); } c = indexer.charAt(off + candidateLen); if (TextUtils.isCnLetter(c)) { backCountMap.increase(c); } } for (char key : frontCountMap.keySet()) { rate = (double) frontCountMap.get(key) / frontCountMap.count(); frontEntropy -= rate * Math.log(rate); } for (char key : backCountMap.keySet()) { rate = (double) backCountMap.get(key) / backCountMap.count(); backEntropy -= rate * Math.log(rate); } return frontEntropy > backEntropy ? 
backEntropy : frontEntropy; } /** * @param candidate * @return */ public double getSolidRate(String candidate) { final int candidateLen = candidate.length(); if (candidateLen < 2) { return 1; } final int count = indexer.count(candidate); double rate = 1; if (count < LEAST_COUNT_THRESHOLD) { return 0; } for (int i = 0; i < candidateLen; i++) { rate *= (double) count / indexer.count("" + candidate.charAt(i)); } return Math.pow(rate,1D / candidateLen) * Math.sqrt(candidateLen); } public void setIndexer(TextIndexer indexer) { this.indexer = indexer; } } NewWordDiscover.java(编辑:晋中站长网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |