-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdocIndex.js
More file actions
27 lines (25 loc) · 946 Bytes
/
docIndex.js
File metadata and controls
27 lines (25 loc) · 946 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
var fs = require('fs');
var html_strip = require('htmlstrip-native');
exports.indexOneDoc=function(htmlDoc2Index, cbFunc) {
var oneDocIndex = {};
var text = "";
// console.log(htmlDoc2Index)
htmlFileContent=fs.readFileSync(htmlDoc2Index,"utf8")
//console.log(htmlFileContent)
var options ={
include_script : false, // include the content of <script> tags
include_style : false, // include the content of <style> tags
compact_whitespace : true // compact consecutive '\s' whitespace into single char
}
var text = html_strip.html_strip(htmlFileContent,options)
var tokens = tokenizer.tokenize(text);
for (var i=0;i<tokens.length;i++){
var oneWord = tokens[i];
if(!(oneWord in oneDocIndex)) {
oneDocIndex[oneWord]=[]
console.log( "herer ....")
}
oneDocIndex[oneWord].concat(i);
}
cbFunc(oneDocIndex)
};