-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinvertedIndex.js
More file actions
118 lines (99 loc) · 3.67 KB
/
invertedIndex.js
File metadata and controls
118 lines (99 loc) · 3.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/*
vi: sw=4 ts=4 expandtab
*/
// Node's file-system module: used by walk (readdir/stat) and indexOneDoc (readFileSync).
var fs = require('fs');
// Third-party HTML stripper; indexOneDoc calls html_strip.html_strip(html, options).
var html_strip = require('htmlstrip-native');
// NOTE(review): doIndex binds its own `allDocs` (walk callback parameter) and its own
// local `documentCount`, so these two module-level values do not appear to be read by
// any code visible in this file.
var allDocs = [];
var documentCount = 0;
//-------------------------------------
/**
 * Recursively collects the paths of every non-directory entry below `dir`.
 *
 * Paths are built as `dir + '/' + name`, so they are relative or absolute
 * exactly as `dir` is. The order of the returned paths is not guaranteed,
 * because the stat calls complete asynchronously.
 *
 * @param {string} dir - directory to scan.
 * @param {function(?Error, string[]=)} done - called exactly once, either as
 *     `done(err)` on the first failure (readdir, stat, or a nested walk), or as
 *     `done(null, paths)` on success.
 */
var walk = function(dir, done) {
    var results = [];
    var finished = false;

    // Reports the outcome once; later completions of in-flight stat/readdir
    // calls are ignored so the caller never sees a second callback.
    var finish = function(err) {
        if (finished) return;
        finished = true;
        if (err) return done(err);
        done(null, results);
    };

    fs.readdir(dir, function(err, list) {
        if (err) return finish(err);
        var pending = list.length;
        if (!pending) return finish(null);

        list.forEach(function(file) {
            var fullPath = dir + '/' + file;
            fs.stat(fullPath, function(err, stat) {
                if (finished) return;
                // An entry that cannot be stat'ed (for example a dangling
                // symlink) is reported instead of being listed as a file.
                if (err) return finish(err);

                if (stat.isDirectory()) {
                    walk(fullPath, function(err, res) {
                        // Without this check a failed sub-walk would append
                        // `undefined` to the result list.
                        if (err) return finish(err);
                        results = results.concat(res);
                        if (!--pending) finish(null);
                    });
                } else {
                    results.push(fullPath);
                    if (!--pending) finish(null);
                }
            });
        });
    });
};
//------------------------------------
// Shared inverted index. doIndex fills it in place, one entry per distinct word:
//   word -> { documentPath: [token positions of that word in the document] }
// Document paths are the strings produced by walk; positions are the token
// indexes recorded by indexOneDoc.
var invertedIndex = {
/* Example shape:
word:{
doc1:[1,4],
doc2:[56,76]
},
word2:{
doc1:[5,2],
doc3:[4,6]
},*/
};
// Exported by reference: importers see the same object that doIndex mutates.
exports.invertedIndex = invertedIndex;
//------------------------------------
/**
 * Builds a positional word index for a single HTML file and passes it to `cbFunc`.
 *
 * The file is read synchronously, so `cbFunc` is invoked before this function
 * returns. Words are case-sensitive; punctuation and underscores are removed
 * before splitting on spaces.
 *
 * @param {string} htmlDoc2Index - path of the HTML file to index.
 * @param {function(string, Object)} cbFunc - called as
 *     `cbFunc(htmlDoc2Index, oneDocIndex)`, where `oneDocIndex` maps each word
 *     to the array of token positions at which it occurs. The map has no
 *     prototype, so iterate it with `for...in` / `Object.keys`.
 * @throws whatever `fs.readFileSync` throws when the file cannot be read.
 */
var indexOneDoc = function(htmlDoc2Index, cbFunc) {
    // Prototype-less dictionary: with an Array or plain object, words such as
    // "constructor", "length" or "push" would match inherited members under `in`
    // and positions would be written onto built-ins instead of the index.
    var oneDocIndex = Object.create(null);
    var htmlFileContent = fs.readFileSync(htmlDoc2Index, "utf8");
    var options = {
        include_script : false, // include the content of <script> tags
        include_style : false, // include the content of <style> tags
        compact_whitespace : true // compact consecutive '\s' whitespace into single char
    };
    var text = html_strip.html_strip(htmlFileContent, options);
    // RegEx: removes everything except alphanumeric characters and whitespace,
    // then collapses runs of whitespace characters into single spaces.
    var str = text.replace(/[^\w\s]|_/g, "").replace(/\s+/g, " ");
    var tokens = str.split(" ");
    for (var i = 0; i < tokens.length; i++) {
        var oneWord = tokens[i];
        // Leading/trailing whitespace yields empty tokens; they are not words.
        // The position counter still advances so real words keep their offsets.
        if (oneWord === "") continue;
        if (!(oneWord in oneDocIndex)) {
            oneDocIndex[oneWord] = [];
        }
        oneDocIndex[oneWord].push(i);
    }
    // oneDocIndex is the index generated from this one document
    cbFunc(htmlDoc2Index, oneDocIndex);
};
//---------- MAIN HERE --------
var uniqWordCount = 0;
exports.doIndex = function(dir,cbIndexDone){
walk(dir, function(err, allDocs) {
if (err) throw err;
var retCnt=0
//------will be called when a document is indexed (@callback)--------
var docIndexDone = function( htmlDoc2Index, oneDocIndex) {
//console.log(oneDocIndex.length , "distinct words in", htmlDoc2Index)
// process.exit(0)
for (var word in oneDocIndex) {
if(!(word in invertedIndex)) {
invertedIndex[word] = {}
uniqWordCount+=1;
}
invertedIndex[word][htmlDoc2Index]=oneDocIndex[word];
}
retCnt +=1
if (retCnt == documentCount) {
console.log("Total Words: " + uniqWordCount + " Total documents: " + documentCount )
console.log(invertedIndex);
console.log("Total Words: " + uniqWordCount + " Total documents: " + documentCount )
cbIndexDone();
}
}
//------
var documentCount = allDocs.length;
//fire document indexing jobs for each document (async)
allDocs.forEach(function(oneDoc){
indexOneDoc(oneDoc, docIndexDone);//use docIndexOne as callback
});
})
};