From 6bf297d313e122099ff1f551c5231a52026d0d14 Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Thu, 29 Feb 2024 20:01:43 +0000
Subject: [PATCH] Minor code porting from Rust to Go for rules engine
 execution
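
Thread the crawler's *processContext through getURLContent() and
extractPageInfo() so the rules engine can reach the per-source
configuration (ctx.source.Config). The actual calls into the rules
engine stay commented out while the engine is still being ported from
Rust; for now this only adds the plumbing, plus a ScrapedData field on
PageInfo to hold the scraper output.

Once the port lands, the intended wiring in extractPageInfo() is
roughly the following sketch (assumptions: the PageInfo under
construction is held in a local pageInfo variable, and the rules
package exposes the ExecuteScrapingRules signature mirrored from the
commented-out code; neither is final):

    scrapedData, err := rules.ExecuteScrapingRules(ctx.source.Config, ctx.wd)
    if err != nil {
        cmn.DebugMsg(cmn.DbgLvlError, "Error executing scraping rules: %v", err)
    }
    pageInfo.ScrapedData = scrapedData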
---
pkg/crawler/crawler.go | 40 ++++++++++++++++++++++++++++++++--------
pkg/crawler/types.go | 1 +
services/api/main.go | 1 +
3 files changed, 34 insertions(+), 8 deletions(-)
diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index 31b5c369..d65b94bb 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -213,13 +213,14 @@ func (ctx *processContext) RefreshSeleniumConnection(sel SeleniumInstance) {
// CrawlInitialURL is responsible for crawling the initial URL of a Source
func (ctx *processContext) CrawlInitialURL(sel SeleniumInstance) (selenium.WebDriver, error) {
cmn.DebugMsg(cmn.DbgLvlInfo, "Crawling URL: %s", ctx.source.URL)
- pageSource, err := getURLContent(ctx.source.URL, ctx.wd, 0)
+ pageSource, err := getURLContent(ctx.source.URL, ctx.wd, 0, ctx)
if err != nil {
cmn.DebugMsg(cmn.DbgLvlError, "Error getting HTML content: %v", err)
ctx.sel <- sel // Assuming 'sel' is accessible
updateSourceState(*ctx.db, ctx.source.URL, err)
return pageSource, err
}
+
// Handle consent
handleConsent(ctx.wd)
@@ -230,7 +231,7 @@ func (ctx *processContext) CrawlInitialURL(sel SeleniumInstance) (selenium.WebDr
ctx.GetHTTPInfo(ctx.source.URL)
// Continue with extracting page info and indexing
- pageInfo := extractPageInfo(pageSource)
+ pageInfo := extractPageInfo(pageSource, ctx)
pageInfo.HTTPInfo = ctx.hi
pageInfo.NetInfo = ctx.ni
@@ -827,7 +828,7 @@ func insertKeywordWithRetries(db cdb.Handler, keyword string) (int, error) {
// getURLContent is responsible for retrieving the HTML content of a page
// from Selenium and returning it as a WebDriver object
-func getURLContent(url string, wd selenium.WebDriver, level int) (selenium.WebDriver, error) {
+func getURLContent(url string, wd selenium.WebDriver, level int, ctx *processContext) (selenium.WebDriver, error) {
// Navigate to a page and interact with elements.
err0 := wd.Get(url)
cmd, _ := exi.ParseCmd(config.Crawler.Interval, 0)
@@ -841,13 +842,26 @@ func getURLContent(url string, wd selenium.WebDriver, level int) (selenium.WebDr
} else {
time.Sleep(time.Second * time.Duration((delay + 5))) // Pause to let Home page load
}
+
+ // Run action rules, if any
+ if ctx.source.Config != nil {
+ // TODO: enable once the rules engine port from Rust is complete
+ cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler action rules...")
+ /*
+ err := rules.ExecuteActionRules(ctx.source.Config, ctx.wd)
+ if err != nil {
+ cmn.DebugMsg(cmn.DbgLvlError, "Error executing action rules: %v", err)
+ }
+ */
+ }
+
return wd, err0
}
// extractPageInfo is responsible for extracting information from a collected page.
// In the future we may want to expand this function to extract more information
// from the page, such as images, videos, etc. and do a better job at screen scraping.
-func extractPageInfo(webPage selenium.WebDriver) PageInfo {
+func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo {
htmlContent, _ := webPage.PageSource()
doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
if err != nil {
@@ -855,12 +869,22 @@ func extractPageInfo(webPage selenium.WebDriver) PageInfo {
return PageInfo{} // Return an empty struct in case of an error
}
+ // Run scraping rules, if any
+ if ctx.source.Config != nil {
+ // TODO: enable once the rules engine port from Rust is complete
+ cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler scraping rules...")
+ /*
+ scrapedData, err := rules.ExecuteScrapingRules(ctx.source.Config, ctx.wd) // will populate PageInfo.ScrapedData
+ if err != nil {
+ cmn.DebugMsg(cmn.DbgLvlError, "Error executing scraping rules: %v", err)
+ }
+ */
+ }
+
title, _ := webPage.Title()
summary := doc.Find("meta[name=description]").AttrOr("content", "")
bodyText := doc.Find("body").Text()
- //containsAppInfo := strings.Contains(bodyText, "app") || strings.Contains(bodyText, "mobile")
-
metaTags := extractMetaTags(doc)
currentURL, _ := webPage.CurrentURL()
@@ -1056,12 +1080,12 @@ func skipURL(processCtx *processContext, id int, url string) bool {
}
func processJob(processCtx *processContext, id int, url string) {
- htmlContent, err := getURLContent(url, processCtx.wd, 1)
+ htmlContent, err := getURLContent(url, processCtx.wd, 1, processCtx)
if err != nil {
cmn.DebugMsg(cmn.DbgLvlError, "Worker %d: Error getting HTML content for %s: %v\n", id, url, err)
return
}
- pageCache := extractPageInfo(htmlContent)
+ pageCache := extractPageInfo(htmlContent, processCtx)
pageCache.sourceID = processCtx.source.ID
indexPage(*processCtx.db, url, pageCache)
extractedLinks := extractLinks(pageCache.BodyText)
diff --git a/pkg/crawler/types.go b/pkg/crawler/types.go
index de4ce4f2..654cb9de 100644
--- a/pkg/crawler/types.go
+++ b/pkg/crawler/types.go
@@ -42,6 +42,7 @@ type PageInfo struct {
DetectedLang string // The detected language of the web page.
NetInfo *neti.NetInfo // The network information of the web page.
HTTPInfo *httpi.HTTPDetails // The HTTP header information of the web page.
+ ScrapedData string // The scraped data from the web page.
}
// Screenshot represents the metadata of a webpage screenshot
diff --git a/services/api/main.go b/services/api/main.go
index dc1b8795..f4170288 100644
--- a/services/api/main.go
+++ b/services/api/main.go
@@ -77,6 +77,7 @@ func main() {
}
}
+// initAPIv1 initializes the API v1 handlers
func initAPIv1() {
searchHandlerWithMiddlewares := SecurityHeadersMiddleware(RateLimitMiddleware(http.HandlerFunc(searchHandler)))
scrImgSrchHandlerWithMiddlewares := SecurityHeadersMiddleware(RateLimitMiddleware(http.HandlerFunc(scrImgSrchHandler)))