From 6bf297d313e122099ff1f551c5231a52026d0d14 Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Thu, 29 Feb 2024 20:01:43 +0000
Subject: [PATCH] Minor code porting from Rust to go for the rules engine
 execution

---
 pkg/crawler/crawler.go | 40 ++++++++++++++++++++++++++++++++--------
 pkg/crawler/types.go   |  1 +
 services/api/main.go   |  1 +
 3 files changed, 34 insertions(+), 8 deletions(-)

diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index 31b5c369..d65b94bb 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -213,13 +213,14 @@ func (ctx *processContext) RefreshSeleniumConnection(sel SeleniumInstance) {
 // CrawlInitialURL is responsible for crawling the initial URL of a Source
 func (ctx *processContext) CrawlInitialURL(sel SeleniumInstance) (selenium.WebDriver, error) {
 	cmn.DebugMsg(cmn.DbgLvlInfo, "Crawling URL: %s", ctx.source.URL)
-	pageSource, err := getURLContent(ctx.source.URL, ctx.wd, 0)
+	pageSource, err := getURLContent(ctx.source.URL, ctx.wd, 0, ctx)
 	if err != nil {
 		cmn.DebugMsg(cmn.DbgLvlError, "Error getting HTML content: %v", err)
 		ctx.sel <- sel // Assuming 'sel' is accessible
 		updateSourceState(*ctx.db, ctx.source.URL, err)
 		return pageSource, err
 	}
+
 	// Handle consent
 	handleConsent(ctx.wd)

@@ -230,7 +231,7 @@ func (ctx *processContext) CrawlInitialURL(sel SeleniumInstance) (selenium.WebDr
 	ctx.GetHTTPInfo(ctx.source.URL)

 	// Continue with extracting page info and indexing
-	pageInfo := extractPageInfo(pageSource)
+	pageInfo := extractPageInfo(pageSource, ctx)
 	pageInfo.HTTPInfo = ctx.hi
 	pageInfo.NetInfo = ctx.ni

@@ -827,7 +828,7 @@ func insertKeywordWithRetries(db cdb.Handler, keyword string) (int, error) {

 // getURLContent is responsible for retrieving the HTML content of a page
 // from Selenium and returning it as a WebDriver object
-func getURLContent(url string, wd selenium.WebDriver, level int) (selenium.WebDriver, error) {
+func getURLContent(url string, wd selenium.WebDriver, level int, ctx *processContext) (selenium.WebDriver, error) {
 	// Navigate to a page and interact with elements.
 	err0 := wd.Get(url)
 	cmd, _ := exi.ParseCmd(config.Crawler.Interval, 0)
@@ -841,13 +842,26 @@ func getURLContent(url string, wd selenium.WebDriver, level int) (selenium.WebDr
 	} else {
 		time.Sleep(time.Second * time.Duration((delay + 5))) // Pause to let Home page load
 	}
+
+	// Run Action Rules if any
+	if ctx.source.Config != nil {
+		// Execute the CROWler rules
+		cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler rules...")
+		/*
+			err = rules.ExecuteActionRules(ctx.source.Config, ctx.wd)
+			if err != nil {
+				cmn.DebugMsg(cmn.DbgLvlError, "Error executing CROWler rules: %v", err)
+			}
+		*/
+	}
+
 	return wd, err0
 }

 // extractPageInfo is responsible for extracting information from a collected page.
 // In the future we may want to expand this function to extract more information
 // from the page, such as images, videos, etc. and do a better job at screen scraping.
-func extractPageInfo(webPage selenium.WebDriver) PageInfo {
+func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo {
 	htmlContent, _ := webPage.PageSource()
 	doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
 	if err != nil {
@@ -855,12 +869,22 @@ func extractPageInfo(webPage selenium.WebDriver) PageInfo {
 		return PageInfo{} // Return an empty struct in case of an error
 	}

+	// Run scraping rules if any
+	if ctx.source.Config != nil {
+		// Execute the CROWler rules
+		cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler rules...")
+		/*
+			PageInfo.ScrapedData, err = rules.ExecuteScrapingRules(ctx.source.Config, ctx.wd)
+			if err != nil {
+				cmn.DebugMsg(cmn.DbgLvlError, "Error executing CROWler rules: %v", err)
+			}
+		*/
+	}
+
 	title, _ := webPage.Title()
 	summary := doc.Find("meta[name=description]").AttrOr("content", "")
 	bodyText := doc.Find("body").Text()

-	//containsAppInfo := strings.Contains(bodyText, "app") || strings.Contains(bodyText, "mobile")
-
 	metaTags := extractMetaTags(doc)

 	currentURL, _ := webPage.CurrentURL()
@@ -1056,12 +1080,12 @@ func skipURL(processCtx *processContext, id int, url string) bool {
 }

 func processJob(processCtx *processContext, id int, url string) {
-	htmlContent, err := getURLContent(url, processCtx.wd, 1)
+	htmlContent, err := getURLContent(url, processCtx.wd, 1, processCtx)
 	if err != nil {
 		cmn.DebugMsg(cmn.DbgLvlError, "Worker %d: Error getting HTML content for %s: %v\n", id, url, err)
 		return
 	}
-	pageCache := extractPageInfo(htmlContent)
+	pageCache := extractPageInfo(htmlContent, processCtx)
 	pageCache.sourceID = processCtx.source.ID
 	indexPage(*processCtx.db, url, pageCache)
 	extractedLinks := extractLinks(pageCache.BodyText)
diff --git a/pkg/crawler/types.go b/pkg/crawler/types.go
index de4ce4f2..654cb9de 100644
--- a/pkg/crawler/types.go
+++ b/pkg/crawler/types.go
@@ -42,6 +42,7 @@ type PageInfo struct {
 	DetectedLang string             // The detected language of the web page.
 	NetInfo      *neti.NetInfo      // The network information of the web page.
 	HTTPInfo     *httpi.HTTPDetails // The HTTP header information of the web page.
+	ScrapedData  string             // The scraped data from the web page.
 }

 // Screenshot represents the metadata of a webpage screenshot
diff --git a/services/api/main.go b/services/api/main.go
index dc1b8795..f4170288 100644
--- a/services/api/main.go
+++ b/services/api/main.go
@@ -77,6 +77,7 @@ func main() {
 	}
 }

+// initAPIv1 initializes the API v1 handlers
 func initAPIv1() {
 	searchHandlerWithMiddlewares := SecurityHeadersMiddleware(RateLimitMiddleware(http.HandlerFunc(searchHandler)))
 	scrImgSrchHandlerWithMiddlewares := SecurityHeadersMiddleware(RateLimitMiddleware(http.HandlerFunc(scrImgSrchHandler)))
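
Note on the commented-out scraping-rules block in extractPageInfo: the placeholder line "PageInfo.ScrapedData, err = rules.ExecuteScrapingRules(...)" cannot be enabled as written, since it assigns to the type name PageInfo rather than to a value. Below is a minimal sketch of one way it could be wired up once the rules engine port lands. The rules package, the ExecuteScrapingRules signature returning (string, error), and the runScrapingRules helper are illustrative assumptions, not part of this patch.

// runScrapingRules is a hypothetical helper (not in this patch). It runs the
// future CROWler scraping rules for the current source and returns the scraped
// data, or an empty string when no source config is set or the rules fail.
func runScrapingRules(ctx *processContext) string {
	if ctx.source.Config == nil {
		return ""
	}
	cmn.DebugMsg(cmn.DbgLvlDebug, "Executing CROWler scraping rules...")
	// Assumed signature: rules.ExecuteScrapingRules(cfg, wd) (string, error).
	data, err := rules.ExecuteScrapingRules(ctx.source.Config, ctx.wd)
	if err != nil {
		cmn.DebugMsg(cmn.DbgLvlError, "Error executing CROWler rules: %v", err)
		return ""
	}
	return data
}

extractPageInfo could then set ScrapedData: runScrapingRules(ctx) when it builds the PageInfo value it returns, so the new ScrapedData field added to types.go is populated from a single place and indexPage picks it up unchanged.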