I wanted to throw this idea out there to see what people thought.
There has been a lot of discussion about integrating full-text search into CouchDB, and possibly implementing full-text search in Erlang. Would it be worth investigating the use of CouchDB's MapReduce functionality to implement a full-text indexer? I whipped together a short example using views. It implements a simple whitespace tokenizer in JavaScript, emits each token with its doc id and position, and reduces each token to a list of doc ids and positions.
Here is the map function:
function(doc) {
  var tokenEmit = function(token) {
    emit([token.value, token.field], [this._id, token.position]);
  }
  var whiteSpaceAnalyzer = function(str, field) {
    // Returns tokens split by white space
    // token: { value: tokenString, position: [0,10] }
    var len = str.length;
    var tokenPositions = new Array();
    var startPosition = null;
    var isTokenChar = function(Char) {
      if (Char === ' ' || Char === '\t' || Char === '\n') return false;
      return true;
    }
    for (var i = 0; i < len; i++) {
      if (startPosition == null) {
        if (isTokenChar(str[i])) {
          // start of word
          startPosition = i;
          if (i + 1 == len) {
            // end of string
            tokenPositions[tokenPositions.length] = [startPosition, i + 1];
          }
        }
      } else {
        if (!isTokenChar(str[i])) {
          // end of word
          tokenPositions[tokenPositions.length] = [startPosition, i];
          startPosition = null; // reset startPosition
          continue;
        }
        if (i + 1 == len) {
          // end of string
          tokenPositions[tokenPositions.length] = [startPosition, i + 1];
        }
      }
    }
    var tokenMap = function(tokenPosition) {
      var token = this.str.substring(tokenPosition[0], tokenPosition[1]);
      return { value: token, field: this.field, position: tokenPosition };
    }
    return tokenPositions.map(tokenMap, { str: str, field: field });
  }
  var tokens;
  for (field in doc) {
    if (typeof(doc[field]) == 'string') {
      tokens = whiteSpaceAnalyzer(doc[field], field);
      tokens.map(tokenEmit, doc);
    }
  }
}
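To make the emitted rows concrete, here is what this map function would produce for a small hypothetical document (note that _id is itself a string field, so this version indexes it too; the expanded version further down skips underscore-prefixed fields):

// hypothetical input document
{"_id": "doc1", "title": "quick brown fox"}

// rows emitted for the title field
emit(["quick", "title"], ["doc1", [0, 5]]);
emit(["brown", "title"], ["doc1", [6, 11]]);
emit(["fox", "title"], ["doc1", [12, 15]]);
// plus one row for the _id field itself:
emit(["doc1", "_id"], ["doc1", [0, 4]]);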
Here is the reduce function:
function(keys, values, combine) {
  // "combine" is the rereduce flag: true when values are the outputs of
  // earlier reduce calls rather than raw mapped values
  var result = new Array();
  var docHash = {};
  if (combine) {
    // each value is itself a list of {doc: id, pos: [...]} objects,
    // so walk every entry and merge the position lists per doc id
    for (var v in values) {
      for (var d in values[v]) {
        var docObject = values[v][d];
        var docId = docObject["doc"];
        var positions = docObject["pos"];
        if (docHash[docId] == null) {
          docHash[docId] = new Array();
        }
        docHash[docId] = docHash[docId].concat(positions);
      }
    }
  } else {
    // each value is a [docId, position] pair straight from the map
    for (var j in values) {
      var docId = values[j][0];
      var position = values[j][1];
      if (docHash[docId] == null) {
        docHash[docId] = new Array();
      }
      docHash[docId] = docHash[docId].concat([position]);
    }
  }
  for (var i in docHash) {
    result[result.length] = {doc: i, pos: docHash[i]};
  }
  return result;
}
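For the same hypothetical index, the first reduce pass turns the mapped [docId, position] values for a key into one entry per document with its merged positions:

// reduce output for key ["quick", "title"], given "quick" at [0,5]
// in doc1's title and at [7,12] and [30,35] in doc2's title
[
  {doc: "doc1", pos: [[0, 5]]},
  {doc: "doc2", pos: [[7, 12], [30, 35]]}
]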
The key emitted from the view is ["token", "field"]. This allows terms to be searched per field while still allowing group_level=1 to combine the results across all fields. Combining the results of multiple fields currently eliminates the use of positions.
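To make the query side concrete: a single-term, single-field lookup would use something like key=["quick","title"]&group=true against the view, while startkey=["quick"]&endkey=["quick",{}]&group_level=1 merges the same token across every field. The exact view URL and parameter spelling depend on the CouchDB version, so treat these query strings as a sketch.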
To reduce the amount of information passed during view generation, the whiteSpaceAnalyzer function can be moved to the main.js file.
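As a rough sketch of that idea (treat the file location as an assumption; the query-server layout varies between CouchDB versions), the analyzer would become a top-level helper in the server script, shown here in a compact regex form rather than the exact loop above:

// in share/server/main.js (assumed location of the view server script):
// a shared analyzer defined once, so design documents don't have to
// carry its source with every map function
function whiteSpaceAnalyzer(str, field) {
  var tokens = [];
  var re = /\S+/g; // runs of non-whitespace characters
  var m;
  while ((m = re.exec(str)) != null) {
    tokens.push({value: m[0], field: field,
                 position: [m.index, m.index + m[0].length]});
  }
  return tokens;
}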
Is this worth pursuing further?
After pursuing this a little further, here's an expanded version of the above map/reduce functions, modified to support stemming (via the Porter stemmer), optional case-insensitivity, a minimum token length, and wider whitespace handling (still English-centric, though).
Here's the map function. Options can be set in the options object at the top, along with the token-separator characters and the ignore lists; after those comes the Porter stemming function (which could get sucked into main.js, as noted by Dan). I still have no idea how to do any kind of boolean operations.
options = {
  stem: porter_stem,
  min_length: 3,
  case_sensitive: false,
  ignore_words: true
};

// characters that end a token (the escaped entries -- tab, newline,
// carriage return -- are assumed here; the list is easy to extend)
token_chars = [' ', '\t', '.', ',', ':', '\n', '\r'];

// list of fields to not index
// defaults set to ['type'] as a common use case
ignore_fields = ['type'];

ignore_words = [
  'about','after','all','also','an','and','another','any','are','as','at','be','because',
  'been','before','being','between','both','but','by','came','can','come',
  'could','did','do','each','for','from','get','got','has','had','he',
  'have','her','here','him','himself','his','how','if','in','into','is',
  'it','like','make','many','me','might','more','most','much','must','my',
  'never','now','of','on','only','or','other','our','out','over','said',
  'same','see','should','since','some','still','such','take','than',
  'that','the','their','them','then','there','these','they','this','those',
  'through','to','too','under','up','very','was','way','we','well','were',
  'what','where','which','while','who','with','would','you','your','a','b',
  'c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t',
  'u','v','w','x','y','z','0','1','2','3','4','5','6','7','8','9'
];

// Porter stemmer in Javascript. Few comments, but it's easy to follow against
// the rules in the original paper, in
//
//   Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
//   no. 3, pp 130-137,
//
// see also http://www.tartarus.org/~martin/PorterStemmer
// Release 1

step2list = new Array();
step2list["ational"] = "ate";
step2list["tional"] = "tion";
step2list["enci"] = "ence";
step2list["anci"] = "ance";
step2list["izer"] = "ize";
step2list["bli"] = "ble";
step2list["alli"] = "al";
step2list["entli"] = "ent";
step2list["eli"] = "e";
step2list["ousli"] = "ous";
step2list["ization"] = "ize";
step2list["ation"] = "ate";
step2list["ator"] = "ate";
step2list["alism"] = "al";
step2list["iveness"] = "ive";
step2list["fulness"] = "ful";
step2list["ousness"] = "ous";
step2list["aliti"] = "al";
step2list["iviti"] = "ive";
step2list["biliti"] = "ble";
step2list["logi"] = "log";

step3list = new Array();
step3list["icate"] = "ic";
step3list["ative"] = "";
step3list["alize"] = "al";
step3list["iciti"] = "ic";
step3list["ical"] = "ic";
step3list["ful"] = "";
step3list["ness"] = "";

c = "[^aeiou]";          // consonant
v = "[aeiouy]";          // vowel
C = c + "[^aeiouy]*";    // consonant sequence
V = v + "[aeiou]*";      // vowel sequence

mgr0 = "^(" + C + ")?" + V + C;                   // [C]VC... is m>0
meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
mgr1 = "^(" + C + ")?" + V + C + V + C;           // [C]VCVC... is m>1
s_v  = "^(" + C + ")?" + v;                       // vowel in stem

function porter_stem(w) {
  var stem;
  var suffix;
  var firstch;
  var origword = w;
  if (w.length < 3) { return w; }
  var re;
  var re2;
  var re3;
  var re4;
  firstch = w.substr(0, 1);
  if (firstch == "y") {
    w = firstch.toUpperCase() + w.substr(1);
  }

  // Step 1a
  re = /^(.+?)(ss|i)es$/;
  re2 = /^(.+?)([^s])s$/;
  if (re.test(w)) { w = w.replace(re, "$1$2"); }
  else if (re2.test(w)) { w = w.replace(re2, "$1$2"); }

  // Step 1b
  re = /^(.+?)eed$/;
  re2 = /^(.+?)(ed|ing)$/;
  if (re.test(w)) {
    var fp = re.exec(w);
    re = new RegExp(mgr0);
    if (re.test(fp[1])) {
      re = /.$/;
      w = w.replace(re, "");
    }
  } else if (re2.test(w)) {
    var fp = re2.exec(w);
    stem = fp[1];
    re2 = new RegExp(s_v);
    if (re2.test(stem)) {
      w = stem;
      re2 = /(at|bl|iz)$/;
      re3 = new RegExp("([^aeiouylsz])\\1$");
      re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
      if (re2.test(w)) { w = w + "e"; }
      else if (re3.test(w)) { re = /.$/; w = w.replace(re, ""); }
      else if (re4.test(w)) { w = w + "e"; }
    }
  }

  // Step 1c
  re = /^(.+?)y$/;
  if (re.test(w)) {
    var fp = re.exec(w);
    stem = fp[1];
    re = new RegExp(s_v);
    if (re.test(stem)) { w = stem + "i"; }
  }

  // Step 2
  re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
  if (re.test(w)) {
    var fp = re.exec(w);
    stem = fp[1];
    suffix = fp[2];
    re = new RegExp(mgr0);
    if (re.test(stem)) { w = stem + step2list[suffix]; }
  }

  // Step 3
  re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
  if (re.test(w)) {
    var fp = re.exec(w);
    stem = fp[1];
    suffix = fp[2];
    re = new RegExp(mgr0);
    if (re.test(stem)) { w = stem + step3list[suffix]; }
  }

  // Step 4
  re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
  re2 = /^(.+?)(s|t)(ion)$/;
  if (re.test(w)) {
    var fp = re.exec(w);
    stem = fp[1];
    re = new RegExp(mgr1);
    if (re.test(stem)) { w = stem; }
  } else if (re2.test(w)) {
    var fp = re2.exec(w);
    stem = fp[1] + fp[2];
    re2 = new RegExp(mgr1);
    if (re2.test(stem)) { w = stem; }
  }

  // Step 5
  re = /^(.+?)e$/;
  if (re.test(w)) {
    var fp = re.exec(w);
    stem = fp[1];
    re = new RegExp(mgr1);
    re2 = new RegExp(meq1);
    re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
    if (re.test(stem) || (re2.test(stem) && !(re3.test(stem)))) { w = stem; }
  }

  re = /ll$/;
  re2 = new RegExp(mgr1);
  if (re.test(w) && re2.test(w)) {
    re = /.$/;
    w = w.replace(re, "");
  }

  // and turn initial Y back to y
  if (firstch == "y") {
    w = firstch.toLowerCase() + w.substr(1);
  }
  return w;
}

function(doc) {
  var tokenEmit = function(token) {
    emit([token.value, token.field], [this._id, token.position]);
    // also emit the stemmed form so both exact and stemmed lookups work
    if (typeof(options.stem) == 'function')
      emit([options.stem(token.value), token.field], [this._id, token.position]);
  }
  var stripIgnoreFields = function(token) {
    return (ignore_fields.indexOf(token.field.toLowerCase()) < 0);
  }
  var stripIgnoreWords = function(token) {
    return (ignore_words.indexOf(token.value) < 0);
  }
  var whiteSpaceAnalyzer = function(str, field) {
    // Returns tokens split by white space
    // token: {value: tokenString, position: [0,10]}
    var len = str.length;
    var tokenPositions = new Array();
    var startPosition = null;
    var isTokenChar = function(chr) {
      return !(token_chars.indexOf(chr) >= 0);
    }
    for (var i = 0; i < len; i++) {
      if (startPosition == null) {
        if (isTokenChar(str[i])) {
          // start of word
          startPosition = i;
          if (i + 1 == len)
            tokenPositions[tokenPositions.length] = [startPosition, i + 1];
        }
      } else {
        if (!isTokenChar(str[i])) {
          // end of word
          tokenPositions[tokenPositions.length] = [startPosition, i];
          startPosition = null; // reset startPosition
          continue;
        }
        if (i + 1 == len) {
          // end of string
          tokenPositions[tokenPositions.length] = [startPosition, i + 1];
        }
      }
    }
    // kill all tokens shorter than min_length
    var newPositions = new Array();
    for (var i = 0; i < tokenPositions.length; i++) {
      if (tokenPositions[i][1] - tokenPositions[i][0] >= options.min_length)
        newPositions.push(tokenPositions[i]);
    }
    tokenPositions = newPositions;
    var tokenMap = function(tokenPosition) {
      var token = this.str.substring(tokenPosition[0], tokenPosition[1]);
      if (!options.case_sensitive) token = token.toLowerCase();
      return {value: token, field: this.field, position: tokenPosition};
    }
    return tokenPositions.map(tokenMap, {str: str, field: field});
  }
  var tokens;
  for (field in doc) {
    if (field[0] != '_' && typeof(doc[field]) == 'string') {
      tokens = whiteSpaceAnalyzer(doc[field], field);
      tokens = tokens.filter(stripIgnoreFields);
      tokens = tokens.filter(stripIgnoreWords);
      tokens.map(tokenEmit, doc);
    }
  }
}
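As an illustration of the combined filters, here is what this version would emit for a hypothetical document: "type" is dropped by ignore_fields, "the" survives min_length but is dropped by ignore_words, and every surviving token is emitted twice, raw and stemmed:

// hypothetical input document
{"_id": "doc1", "type": "post", "body": "Searching the indexes"}

// rows emitted for the body field
emit(["searching", "body"], ["doc1", [0, 9]]);
emit(["search", "body"], ["doc1", [0, 9]]);    // porter_stem("searching")
emit(["indexes", "body"], ["doc1", [14, 21]]);
emit(["index", "body"], ["doc1", [14, 21]]);   // porter_stem("indexes")

Note that a token which is already its own stem gets emitted twice under the same key; comparing the stemmed and raw values in tokenEmit before the second emit would avoid the duplicate rows.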
Here is the reduce function (mostly unchanged from above):
function(keys, values, combine) {
  // "combine" is the rereduce flag: true when values are the outputs of
  // earlier reduce calls rather than raw mapped values
  var result = new Array();
  var docHash = {};
  if (combine) {
    // each value is itself a list of {doc: id, pos: [...]} objects,
    // so walk every entry and merge the position lists per doc id
    for (var v in values) {
      for (var d in values[v]) {
        var docObject = values[v][d];
        var docId = docObject["doc"];
        var positions = docObject["pos"];
        if (docHash[docId] == null) docHash[docId] = new Array();
        docHash[docId] = docHash[docId].concat(positions);
      }
    }
  } else {
    // each value is a [docId, position] pair straight from the map
    for (var j in values) {
      var docId = values[j][0];
      var position = values[j][1];
      if (docHash[docId] == null) docHash[docId] = new Array();
      docHash[docId] = docHash[docId].concat([position]);
    }
  }
  for (var i in docHash) {
    result[result.length] = {doc: i, pos: docHash[i]};
  }
  return result;
}
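On the open question of boolean operations, one naive option is to run one view query per term and intersect the document lists on the client. This is only a sketch: fetchTermResult is a hypothetical helper standing in for the actual HTTP view request, assumed to query with key=[term, field]&group=true and return the reduce output shown above.

// hypothetical client-side AND over per-term view results
function andSearch(terms, field) {
  var counts = {};
  for (var t = 0; t < terms.length; t++) {
    var hits = fetchTermResult(terms[t], field); // [{doc: id, pos: [...]}, ...]
    for (var h = 0; h < hits.length; h++) {
      // the reduce emits each doc id at most once per key, so counting works
      counts[hits[h].doc] = (counts[hits[h].doc] || 0) + 1;
    }
  }
  var matches = [];
  for (var id in counts) {
    if (counts[id] == terms.length) matches.push(id);
  }
  return matches; // doc ids containing every term
}

This only gives document-level AND; phrase or proximity queries would additionally need the position lists compared.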