Merge branch 'leveldb'

monbro · Sep 8, 2013 · a63259c · a63259c
2 parents 1861bd1 + 3ff9463
commit a63259c
Show file tree

Hide file tree

Showing 10 changed files with 462 additions and 329 deletions.
diff --git a/README.md b/README.md
@@ -35,12 +35,14 @@ The following examples where given after the system was collecting for about one
 
 * install npm and node if you have not already (http://howtonode.org/introduction-to-npm or http://nodejs.org/)
 * install / start your redis server (http://redis.io/topics/quickstart) on a disk with several free GB
+
 * clone this repo "git clone https://github.com/monbro/opensemanticapi.git"
 * change config if needed in "/config.js"
+* NOTE: in the config, set 'http_server' to false to run the scraper / cronjob, or to true to run the http api server
 * open the repository folder in your console
 * enter "npm install", it will install all dependencies automatically
 * start the node server with "node app.js"
-* now it should print what it is collecting
+* now it should print what it is collecting or what http route is requested
 * the longer it collects data the better the results should be
 * now you can access the relations through your browser like http://localhost:8080/relations/database or direct by accessing your redis server
 
@@ -49,5 +51,9 @@ The following examples where given after the system was collecting for about one
 * improve performance of script
 * allow certain configurations (half done)
 * try to use http://yeoman.io/ with its structure for more compatibility and understanding
+* write tests
+* connect to travis-ci
+* better folder structure
+* write a version using LevelDB or similar
 
 This software is published under the MIT-License. See 'license' for more information.
diff --git a/app.js b/app.js
@@ -9,339 +9,44 @@
  * @version 0.1
  */
 
-/* Used variables names
- * ____sites____ = wikipedia page titles which were scraped already
- * ____sites2do____ = wikipedia page titles which are queued to scrape
- * ____all____ = a collection of all ever seen words with a increment number
- */
-
-/** 
- * Modules
- */
-var restify = require('restify');
-var express = require('express');
-var config = require('./config.js');
-var tools = require('./lib/tools');
-var redis = require("redis");
-// var mongoose = require('mongoose');
-var _ = require("underscore");
-var $ = require("jquery");
-
-/** 
- * Objects
- */
-
-// create redis client
-var client = redis.createClient();
-
-// create restify server to server http api
-var server = restify.createServer();
-server.use(restify.bodyParser());
-
-// create restify json client for api requests
-var wikipedia = restify.createJsonClient({
-  url: 'http://'+config.creds.lang+'.wikipedia.org',
-  version: '*'
-});
-
-/** 
- * Run
- */
-
-// start api requests with given keyword
-wikiSearch('database');
-
-/** 
- * Helper functions
- */
-
-/** 
- * function wikiSearch will start the main processes to search for the best wikipedia page for the given string
- *
- * @param string term
- * @return boolean
- */
-function wikiSearch(term) {
-  // do api call
-  wikipedia.get('/w/api.php?action=opensearch&search='+escape(term)+'&format=json&limit=3', function(err, req, res, data) {
-
-    if(typeof data[1] == 'undefined' || typeof data[1][0] == 'undefined') {
-      if(config.creds.debug)
-        console.log('No page found in wikipedia for '+req.path);
-      client.srem('____sites2do____',term);
-      goToNext();
-      return;
-    }
-
-    // get first matching result
-    var firstTitle = data[1][0];
-
-    // set first result as done
-    client.sadd('____sites____', firstTitle, function (err, result) {
-      if(result) {
-        wikiGrab(firstTitle);
-        client.srem('____sites2do____',firstTitle);
-      }
-      else {
-        if(config.creds.debug)
-          console.log(firstTitle+' was crawled already!');
-        goToNext();
-        return false;
-      }
-    });
-
-    // add all sites to queue
-    for (var i = data[1].length - 1; i >= 0; i--) {
-      client.sadd('____sites2do____',data[1][i]);
-    }
-
-  });
-}
-
-/** 
- * function wikiGrab will get the content for the given wikipedia page title
- *
- * @param string title
- * @return boolean
- */
-function wikiGrab(title) {
-  // do the api call
-  wikipedia.get('/w/api.php?rvprop=content&format=json&prop=revisions|categories&rvprop=content&action=query&titles='+escape(title), function(err, req, res, data) {
-    if(typeof data.query == 'undefined') {
-      goToNext();
-      return false;
-    }
-
-    // check if valid content
-    if(typeof data.query.pages[Object.keys(data.query.pages)[0]].revisions == 'undefined') {
-      goToNext();
-      return false;
-    }
-
-    // get the main content of the wikipedia page
-    var rawtext = data.query.pages[Object.keys(data.query.pages)[0]].revisions[0]["*"];
-     // now split the whole content into text blocks
-    var parts = rawtext.split(/\n|\r/);
-    var snippets = [];
-
-    if(config.creds.debug)
-      console.log('going to http://'+config.creds.lang+'.wikipedia.org/wiki/'+title);
-
-    // loop all text blocks and pull these with more than config.creds.min_text_block_length (default: 120) chars
-    for (var i = parts.length - 1; i >= 0; i--) {
-      if(parts[i].length > config.creds.min_text_block_length) {
-        snippets.push(parts[i]);
-      }
-    }
-
-    if(snippets.length > 0) {
-      // give the loop worker something to do
-      loopWorker(snippets);
-    }
-    else {
-      // restart fetch
-      goToNext();
-    }
-
-  });
-}
-
-/** 
- * function loopWorker will process all snippets gently for your system
- *
- * @param array snippets
- * @return
- */
-function loopWorker(snippets) {
-  // when snippetbox is empty, restart fetch
-  if(snippets.length === 0) {
-    if(config.creds.debug)
-      console.log('Count of snippets: '+snippets.length);
-    goToNext();
-    return;
-  }
-
-  // analyze full text block
-  $.when(analyzeText(snippets.pop(),snippets.length)).done(function() {
-    // set a timeout to be gently to the memory and cpu 
-    // (can be changed in the config file)
-    var t=setTimeout(function(){loopWorker(snippets);},config.creds.sleeptime);
-  });
-}
-
-/** 
- * function goToNext will move on to a random element to search for in the queue ____sites2do____ which is stored in redis
- *
- * @param
- * @return
- */
-function goToNext() {
-  if(config.creds.debug)
-    console.log('NEXT');
-  client.srandmember('____sites2do____', function (err, result) {
-    wikiSearch(result);
-  });
-}
-
-/** 
- * function analyzeText will get the content for the given wikipedia page title
- *
- * @param string title
- * @return boolean
- */
-function analyzeText(snippet,counter) {
-
-  // split the text block to words
-  var words = tools.tokenize(snippet);
-
-  if(config.creds.debug)
-      console.log('Count of words in snippet ('+counter+'): '+words.length);
-
-  // create empty object
-  var obj = {};
-
-  var multi = client.multi();
-
-  // loop all words
-  for (var i = words.length - 1; i >= 0; i--) {
-
-    // count all seen words
-    if(typeof obj[words[i].toLowerCase()] == 'undefined')
-      obj[words[i].toLowerCase()] = 1;
-    else
-      obj[words[i].toLowerCase()]++;
-
-    // add every word to the queue to spread the scrape
-    multi.sadd('____sites2do____',words[i].toLowerCase());
-
-    // if(config.creds.debug)
-    //   console.log(words[i].toLowerCase()+'¥ - '+words[j].toLowerCase()+' - '+similar_text(words[i].toLowerCase(),words[j].toLowerCase(),1));
-  }
-
-  var base;
-
-  $.each(obj, function(index, value) {
-
-    // skip if not valid
-    if(typeof index == 'undefined' || typeof index.toLowerCase == 'undefined')
-      return;
-
-    // create new obj from class Base, make sure to work with lowercase only
-    base = new Base(index.toLowerCase());
-
-    // loop all words
-    $.each(obj, function(index2, value2) {
-      if(index != index2) {
-        // add relation, value2 is the counter of how often the word was seen in the recent textblock
-        base.pushRelation(index2.toLowerCase(),value2);
-      }
-    });
-
-    base.save();
-
-    // add to our general 'ALL' collection, to identify the most used words of all
-    multi.sadd('____all____', index.toLowerCase()); // add keyword
-    multi.incrby('____all____'+':'+index.toLowerCase(), value); // track its density
-
-  });
-
-  multi.exec(function(err, replies) {
-      return true;
-  });
-}
-
 /** 
- * function inAButNotInB will remove all items from array a which are in array b
- * depending on underscore.js
- *
- * @param
- * @return
+ * load config
  */
-function inAButNotInB(A, B) {
-  return _.filter(A, function (d) {
-    return !_.contains(B, d);
-  });
-}
+var config = require('./config');
 
 /** 
- * class Base will get handle database-actions related to one keyword
- *
- * @param string val
- * @return boolean
+ * Basic Objects
  */
-function Base(val) {
 
-  // Store variables
-  var that = this,
-      multi_in = client.multi(), // to pipeline actions for redis
-      res;
+// our basic app object
+var App = function() {};
 
-  // to set the restify response, a bit hacky actually
-  this.setRes = function(val) {
-    res = val;
-  };
+// our basic app
+var app = new App();
 
-  // process the pipelined actions in redis
-  this.save = function() {
-    multi_in.exec();
-  };
+var Model = require('./app/model');
 
-  // get all relationes, without the noise
-  this.getTopRelations = function() {
-    // get most often used keywords (limit 500)
-    client.sort('____all____', "by", "____all____:*", 'LIMIT', 0, 500, 'DESC', "get", "#", function (err1, items1) {
-        // get most often realted keywords for the given keyword
-        client.sort(val, "by", val+":*", 'LIMIT', 0, 120, 'DESC', "get", "#", function (err2, items2) {
-          // remove the noise by removing the most often used keywords
-          doResponse(inAButNotInB(items2,items1),res);
-        });
-    });
-  };
+App.prototype.getModel = function(s) {
+  return new Model(s); 
+};
 
-  // add word and count up
-  this.pushRelation = function(rel, incr) {
-    multi_in.sadd(val, rel);
-    if(typeof incr == 'undefined') {
-      incr = 1;
-    }
-    multi_in.incrby(val+':'+rel, incr);
-  };
-}
+// var test = app.getModel('test');
+// console.log(test.getValue());
 
-/** 
- * function doResponse will send the response to the client
- *
- * @param string data
- * @return res
- */
-function doResponse(data, res) {
-  res.send(data);
-}
+if(!config.creds.http_server) {
+    var Scraper = require("./app/scraping");
+    var scraper = new Scraper();
 
-/** 
- * function getRelations will take action as a router function to deliver all relations to the requested keyword
- *
- * @param string req.params.name
- * @return boolean
- */
-function getRelations(req, res, next) {
-  var base = new Base(req.params.name);
-  base.setRes(res);
-  base.getTopRelations();
+    // Start Cronjob
+    scraper.wikiSearch('database');
 }
 
-/** 
- * Routes
- */
-
-// Set up our routes
-server.get('/relations/:name', getRelations);
-
 /** 
  * Server
  */
 
-// start the server
-server.listen(config.creds.server_port, function() {
-  console.log('%s listening at %s', server.name, server.url);
-});
+// Start the RESTful HTTP API server
+if(config.creds.http_server) {
+    var Http = require("./app/http");
+    var http = new Http(); 
+}