利用文本挖掘技术来找出网络中的“小鲜词”
发布时间:2021-01-20 19:27:41 所属栏目:大数据 来源:网络整理
导读:开始之前,先看一下从人人网中发现的90后用户爱用的词,是不是很好玩?哈哈。写这篇文章就是为了让你简单、自动地从文本中找出新词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的很有用,呜呜)。 项目结构 当然,text.dat和common.d……
// Word-extraction program (section caption from the surrounding article).
package grid.text.evolution;

import grid.common.TextUtils;
import grid.text.dic.CnDictionary;
import grid.text.index.CnPreviewTextIndexer;
import grid.text.index.TextIndexer;
import grid.text.selector.CnTextSelector;
import grid.text.selector.TextSelector;

import java.util.HashSet;
import java.util.Set;

/**
 * Finds candidate "new words" in a document: strings that are not in the
 * dictionary but are accepted by an {@link EntropyJudger}.
 *
 * <p>Candidates are produced by a {@link CnTextSelector} configured with
 * {@link #MIN_CANDIDATE_LEN} and {@link #MAX_CANDIDATE_LEN}. A candidate is
 * discarded when it is blank or when its first or last character is one of the
 * structural characters listed in {@link #STRUCTURAL_LETTERS}.
 */
public class NewWordDiscover {

    /** Dictionary of already-known words, obtained from {@code CnDictionary.Instance()}. */
    private final CnDictionary dictionary;

    /** Minimum word length. */
    private static final int MIN_CANDIDATE_LEN = 2;

    /** Maximum word length. */
    private static final int MAX_CANDIDATE_LEN = 6;

    /**
     * Characters (pronouns, particles, interjections) that disqualify a
     * candidate when they appear at its start or end.
     */
    private static final char[] STRUCTURAL_LETTERS = {
        '我', '你', '您', '他', '她', '谁', '哪', '那', '这', '的', '了', '着', '也',
        '是', '有', '不', '在', '与', '呢', '啊', '呀', '吧', '嗯', '哦', '哈', '呐'
    };

    /** Lookup set built once from {@link #STRUCTURAL_LETTERS}; never modified afterwards. */
    private static final Set<Character> STRUCTURAL_LETTER_SET = new HashSet<Character>();

    static {
        for (char c : STRUCTURAL_LETTERS) {
            STRUCTURAL_LETTER_SET.add(c);
        }
    }

    /** Creates a discoverer backed by the shared {@code CnDictionary} instance. */
    public NewWordDiscover() {
        dictionary = CnDictionary.Instance();
    }

    /**
     * Extracts new-word candidates from {@code document}.
     *
     * <p>Original author's note: discovery is based on statistics and entropy,
     * so the document should be around the 100 KB level; smaller inputs may
     * give unsatisfying results.
     *
     * @param document the text to analyse; {@code null} or empty yields an empty set
     * @return a new mutable set of accepted candidates, never {@code null}
     */
    public Set<String> discover(String document) {
        Set<String> found = new HashSet<String>();
        // Nothing to scan: return early instead of handing null to the indexer/selector.
        if (document == null || document.length() == 0) {
            return found;
        }

        TextIndexer indexer = new CnPreviewTextIndexer(document);
        TextSelector selector =
                new CnTextSelector(document, MIN_CANDIDATE_LEN, MAX_CANDIDATE_LEN);
        EntropyJudger judger = new EntropyJudger(indexer);

        while (!selector.end()) {
            String candidate = selector.next();
            if (TextUtils.isBlank(candidate) || hasStructuralEdge(candidate)) {
                continue;
            }

            // To discover words without any dictionary, reduce this condition
            // to just "found.contains(candidate)".
            boolean alreadyKnown = dictionary.contains(candidate) || found.contains(candidate);
            if (alreadyKnown) {
                // NOTE(review): select() presumably tells the selector to accept this
                // candidate and move past it - confirm against TextSelector.
                selector.select();
            } else if (judger.judge(candidate)) {
                found.add(candidate);
            }
        }
        return found;
    }

    /**
     * Tells whether the first or last character of {@code candidate} is a structural character.
     * Callers pass only candidates that {@code TextUtils.isBlank} rejected as blank;
     * presumably that guarantees at least one character - confirm against TextUtils.
     */
    private static boolean hasStructuralEdge(String candidate) {
        char first = candidate.charAt(0);
        char last = candidate.charAt(candidate.length() - 1);
        return STRUCTURAL_LETTER_SET.contains(first) || STRUCTURAL_LETTER_SET.contains(last);
    }
}

/*
 * Trailing text from the source web page, kept for completeness:
 * "index" (Editor: Jinzhong Webmaster Network)
 * Disclaimer: all content on this site comes from the Internet; the opinions
 * expressed are those of the author only and do not represent the position of
 * this site. If your rights have been unintentionally infringed, please contact
 * the webmaster promptly to have the content removed.
 */