从一段文本中提取关键短语的最佳方法是什么？我正在编写一个关键字提取工具，类似这样的东西。我找到了一些用于 Python 和 Perl 的 n-gram 提取库，但我是在 Node 中编写这个工具的，所以需要一个 JavaScript 解决方案。如果没有现成的 JavaScript 库，有人可以解释一下该如何实现，以便我自己编写吗？
我喜欢这个想法，所以我已经实现了它：见下文（包含说明性的代码注释）。
预览:http://fiddle.jshell.net/WsKMx/
/**
 * @author Rob W, created on 16-17 September 2011, on request for Stackoverflow
 *         (http://stackoverflow.com/q/7085454/938089)
 * Modified on 17 juli 2012, fixed IE bug by replacing [,] with [null]
 *
 * This script counts how often every phrase of one to `numWords` consecutive
 * words occurs in a block of text. For simplicity and efficiency there is
 * only one pass over the text. 100% accuracy would require much more
 * computing power, which is usually unnecessary.
 **/

/**
 * Splits raw text into word tokens.
 *
 * @param {string} rawText - The text to analyse.
 * @param {RegExp} disallowed - Global pattern matching every run of characters
 *     that is NOT part of a word; each match is replaced by a single space.
 * @param {boolean} lowerCase - When true, tokens are lower-cased so that
 *     counting becomes case-insensitive.
 * @returns {string[]} The tokens in order of appearance; an empty array when
 *     the text contains no valid characters.
 */
function tokenizeText(rawText, disallowed, lowerCase) {
    var cleaned = rawText.replace(disallowed, " ").replace(/^\s+/, "").replace(/\s+$/, "");
    if (lowerCase) cleaned = cleaned.toLowerCase();
    // "".split(/\s+/) yields [""], which would be counted as one empty word.
    return cleaned === "" ? [] : cleaned.split(/\s+/);
}

/**
 * Counts every phrase of 1..maxWords consecutive tokens.
 *
 * @param {string[]} words - Tokens as returned by tokenizeText.
 * @param {number} maxWords - Longest phrase length (in words) to count.
 * @returns {Array} An array whose index n (1..maxWords) holds an object that
 *     maps each n-word phrase to its number of occurrences. Index 0 is null:
 *     a phrase of length zero is empty.
 */
function countPhrases(words, maxWords) {
    var hasOwn = Object.prototype.hasOwnProperty;
    var counts = [null];
    var start, n, phrase, bucket;
    for (n = 1; n <= maxWords; n++) {
        counts.push({});
    }
    for (start = 0; start < words.length; start++) {
        phrase = "";
        // Grow the phrase one word at a time, stopping at the end of the text.
        for (n = 1; n <= maxWords && start + n <= words.length; n++) {
            phrase = n === 1 ? words[start] : phrase + " " + words[start + n - 1];
            bucket = counts[n];
            // An own-property check is required: words such as "constructor"
            // would otherwise pick up the inherited Object.prototype member.
            bucket[phrase] = hasOwn.call(bucket, phrase) ? bucket[phrase] + 1 : 1;
        }
    }
    return counts;
}

var text = "A quick brown fox jumps over the lazy old bartender who said 'Hi!' as a response to the visitor who presumably assaulted the maid's brother, because he didn't pay his debts in time. In time in time does really mean in time. Too late is too early? Nonsense! 'Too late is too early' does not make any sense.";

var atLeast = 2;       // Show results with at least .. occurrences
var numWords = 5;      // Show statistics for one to .. words
var ignoreCase = true; // Case-sensitivity
var REallowedChars = /[^a-zA-Z'\-]+/g;
// RE pattern to select valid characters. Invalid characters are replaced with a whitespace

var i, k, len, textlen, phrase, words;

// Remove all irrelevant characters and split the text into words
var tokens = tokenizeText(text, REallowedChars, ignoreCase);
textlen = tokens.length;

// keys[n] maps every n-word phrase to its count; keys[0] is null
var keys = countPhrases(tokens, numWords);

// Prepare results for advanced analysis: keep phrases that occur often enough.
// results[n] is a list of {word, count} records for n-word phrases.
var results = [];
for (k = 1; k <= numWords; k++) {
    results[k] = [];
    for (phrase in keys[k]) {
        if (Object.prototype.hasOwnProperty.call(keys[k], phrase) && keys[k][phrase] >= atLeast) {
            results[k].push({"word": phrase, "count": keys[k][phrase]});
        }
    }
}

// Result parsing
var outputHTML = []; // Buffer data. This data is used to create a table using `.innerHTML`

// Most frequent phrase first
var byCountDescending = function (x, y) { return y.count - x.count; };
for (k = 1; k <= numWords; k++) {
    results[k].sort(byCountDescending);

    // Customize your output. For example:
    words = results[k];
    if (words.length) {
        outputHTML.push('<td colSpan="3" class="num-words-header">' + k + ' word' + (k === 1 ? "" : "s") + '</td>');
    }
    for (i = 0, len = words.length; i < len; i++) {
        // With the default REallowedChars only letters, ' and - survive
        // tokenizing, so the phrase cannot contain markup.
        outputHTML.push("<td>" + words[i].word + "</td><td>" +
            words[i].count + "</td><td>" +
            Math.round(words[i].count / textlen * 10000) / 100 + "%</td>");
        // textlen is the total number of words in the text.
        // The relative occurrence has a precision of 2 digits.
    }
}
outputHTML = '<table id="wordAnalysis"><thead><tr>' +
    '<td>Phrase</td><td>Count</td><td>Relativity</td></tr>' +
    '</thead><tbody><tr>' + outputHTML.join("</tr><tr>") +
    "</tr></tbody></table>";

// Only touch the DOM when one exists, so the analysis also runs under Node,
// where `results` and `outputHTML` can be used directly.
if (typeof document !== "undefined") {
    var outputTarget = document.getElementById("RobW-sample");
    if (outputTarget) outputTarget.innerHTML = outputHTML;
}
/*
CSS:
#wordAnalysis td{padding:1px 3px 1px 5px}
.num-words-header{font-weight:bold;border-top:1px solid #000}
HTML:
<div id="RobW-sample"></div>
*/