利用文本挖掘技术来找出网络中的“小鲜词”
发布时间:2021-01-20 19:27:41 所属栏目:大数据 来源:网络整理
导读:开始之前,先看一下从人人网中发现的90后用户爱用的词 是不是很好玩,哈哈。写这篇文章就是让你简单、自动地从文本中找出新的词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜) 项目结构 当然,text.dat和common.d
分词处理,具体看实现 Chunk.javapackage grid.text.participle; import grid.text.dic.CnDictionary; import java.util.List; public class Chunk implements Comparable<Chunk> { private List<String> list; private int len = 0; private double avg = 0; private double variance = 0; public Chunk(List<String> list) { this.list = list; init(); } private void init() { for (String s : list) { len += s.length(); } avg = (double) len / list.size(); for (String s : list) { variance += Math.pow(avg - s.length(),2); } variance = Math.sqrt(variance); } public int getLen() { return len; } public double getAvg() { return avg; } public double getVariance() { return variance; } public String getHead() { if (null == list || list.isEmpty()) { return ""; } return list.get(0); } private int compareDouble(double d1,double d2) { if (d1 - d2 < -0.0000001D) { return 1; } else if (d1 - d2 > 0.0000001D) { return -1; } return 0; } @Override public int compareTo(Chunk o) { if (len != o.len) { return o.len - len; } int d = compareDouble(avg,o.avg); if (0 != d) { return d; } d = compareDouble(variance,o.variance); if (0 != d) { return d; } CnDictionary dictionary = CnDictionary.Instance(); double rateSrc = 0,rateDest = 0; for (String s : list) { if (1 == s.length()) { rateSrc += dictionary.rate(s.charAt(0)); } } for (String s : o.list) { if (1 == s.length()) { rateDest += dictionary.rate(s.charAt(0)); } } return compareDouble(rateSrc,rateDest); } public String toString() { return list.toString(); } } ChunkStream.javapackage grid.text.participle; import grid.common.Node; import grid.common.TextUtils; import grid.common.Tree; import grid.text.dic.CnDictionary; import java.util.ArrayList; import java.util.Collections; import java.util.List; public class ChunkStream { /** * Define the max supposed word length * * You could shorten the value if you don't need too long participle result */ private static final int MAX_WORD_LEN = 7; /** * Define the predict level while execute participle. 
* * Negligible accuracy will be promoted if you increase this value */ private static final int PREDICT_LEVEL = 3; private static CnDictionary dictionary = CnDictionary.Instance(); public String next(String text,int off) { Tree<String> root = new Tree<String>("ROOT"); recurse(root,off,text,0); List<Node<String>> list = root.getLeaves(); List<Chunk> chunkList = new ArrayList<Chunk>(); for (Node<String> node : list) { chunkList.add(new Chunk(node.getBranchPath())); } Collections.sort(chunkList); return chunkList.get(0).getHead(); } private void recurse(Node<String> node,String text,int predictDeep) { int len = MAX_WORD_LEN + off > text.length() ? text.length() - off : MAX_WORD_LEN; while (predictDeep < PREDICT_LEVEL) { if (len < 1) { return; } String s = text.substring(off,off + len); if (len < 2) { if (!TextUtils.isCnLetter(text.charAt(off))) { break; } recurse(node.add(s),off + 1,predictDeep + 1); } else if (dictionary.contains(s)) { recurse(node.add(s),off + s.length(),predictDeep + 1); } len--; } } } MechanicalParticiple.javapackage grid.text.participle; import grid.common.TextUtils; import java.util.Vector; public class MechanicalParticiple { public Vector<String> partition(String document) { Vector<String> vector = new Vector<String>(); final int docLen = document.length(); int off = 0; char c; String seg = ""; ChunkStream stream = new ChunkStream(); while (off < docLen) { c = document.charAt(off); if (TextUtils.isEnLetter(c) || TextUtils.isNumeric(c)) { seg += c; off++; } else if (TextUtils.isCnLetter(c)) { if (!TextUtils.isBlank(seg)) { vector.add(seg); seg = ""; } String word = stream.next(document,off); if (!TextUtils.isBlank(word)) { vector.add(word); off += word.length(); } } else { if (!TextUtils.isBlank(seg)) { vector.add(seg); seg = ""; } /** * TODO: Uncomment the "ELSE IF" clause if you would like to * reserve punctuations */ // else if (!TextUtils.isBlank("" + c)) { vector.add("" + c); } off++; } } if (!TextUtils.isBlank(seg)) { vector.add(seg); } 
return vector; } } selector(编辑:晋中站长网) 【声明】本站内容均来自网络,其相关言论仅代表作者个人观点,不代表本站立场。若无意侵犯到您的权利,请及时与联系站长删除相关内容! |