From ec3dac6dbb65f2785b43b356a197eb006a28c082 Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Fri, 1 Mar 2024 19:10:25 +0000
Subject: [PATCH] Improved quality of meta tag extraction and fixed a bug in the
removeSource command
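
Meta tags are now extracted into an ordered []MetaTag slice instead of a
map[string]string, so duplicate names and document order are preserved, and
the extracted body text is cleaned up (tabs turned into spaces, runs of
whitespace collapsed) before indexing. WebObjects gains an object_html
column that stores the page HTML in place of object_url, whose index is
dropped from the schema. The removeSource console handler now resolves the
source_id first, deletes by source_id, and calls cleanup_orphaned_httpinfo()
and cleanup_orphaned_netinfo() without the extra sourceID argument that was
previously passed to them; the standalone removeSource command is reworked
to use the same deletion logic. URLs are normalized (whitespace and trailing
slashes trimmed, lowercased) before being inserted by addSource and the
console API. A short usage sketch of the reworked helpers follows the
diffstat below.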
---
cmd/addSource/main.go | 15 +++
cmd/removeSource/main.go | 148 +++++++----------------
pkg/crawler/crawler.go | 44 +++++--
pkg/crawler/keywords.go | 4 +-
pkg/crawler/keywords_test.go | 2 +-
pkg/crawler/types.go | 10 +-
pkg/database/postgresql-setup-v1.2.pgsql | 15 +--
services/api/console.go | 9 +-
services/api/helpers.go | 12 ++
9 files changed, 126 insertions(+), 133 deletions(-)
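
For review purposes, a minimal, self-contained sketch of how the reworked
helpers behave. The MetaTag type and the extractMetaTags and normalizeURL
bodies are taken from this patch; the sample HTML, package wiring and main
function are illustrative assumptions only, not part of the change.

package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// MetaTag mirrors the new pkg/crawler type: one entry per <meta> element, so
// duplicate names and document order are preserved (a map[string]string kept
// only the last occurrence of each name).
type MetaTag struct {
	Name    string
	Content string
}

// extractMetaTags mirrors the reworked extraction in pkg/crawler/crawler.go.
func extractMetaTags(doc *goquery.Document) []MetaTag {
	var metaTags []MetaTag
	doc.Find("meta").Each(func(_ int, s *goquery.Selection) {
		if name, exists := s.Attr("name"); exists {
			content, _ := s.Attr("content")
			metaTags = append(metaTags, MetaTag{Name: name, Content: content})
		}
	})
	return metaTags
}

// normalizeURL mirrors the crawler helper: spaces and trailing slashes are
// always trimmed; bit 1 of flags additionally lowercases the URL.
func normalizeURL(url string, flags uint) string {
	url = strings.TrimSpace(url)
	url = strings.TrimRight(url, "/")
	if flags&1 == 1 {
		url = strings.ToLower(url)
	}
	return url
}

func main() {
	// Illustrative page only; any HTML with <meta name=...> tags works.
	page := `<html><head>
	<meta name="description" content="Example page">
	<meta name="keywords" content="crawler, indexing">
	</head><body></body></html>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(page))
	if err != nil {
		panic(err)
	}
	for _, mt := range extractMetaTags(doc) {
		fmt.Printf("%s => %s\n", mt.Name, mt.Content)
	}

	fmt.Println(normalizeURL("  https://Example.COM/Path/ ", 0)) // case preserved
	fmt.Println(normalizeURL("  https://Example.COM/Path/ ", 1)) // lowercased
}
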
diff --git a/cmd/addSource/main.go b/cmd/addSource/main.go
index c07fa906..f4c3c7e8 100644
--- a/cmd/addSource/main.go
+++ b/cmd/addSource/main.go
@@ -21,6 +21,7 @@ import (
"flag"
"fmt"
"log"
+ "strings"
cfg "github.com/pzaino/thecrowler/pkg/config"
@@ -35,6 +36,9 @@ func insertWebsite(db *sql.DB, url string) error {
// SQL statement to insert a new website
stmt := `INSERT INTO Sources (url, last_crawled_at, status) VALUES ($1, NULL, 'pending')`
+ // Normalize the URL
+ url = normalizeURL(url)
+
// Execute the SQL statement
_, err := db.Exec(stmt, url)
if err != nil {
@@ -45,6 +49,17 @@ func insertWebsite(db *sql.DB, url string) error {
return nil
}
+// normalizeURL normalizes a URL by trimming spaces and trailing slashes and converting it to lowercase.
+func normalizeURL(url string) string {
+ // Trim spaces
+ url = strings.TrimSpace(url)
+ // Trim trailing slash
+ url = strings.TrimRight(url, "/")
+ // Convert to lowercase
+ url = strings.ToLower(url)
+ return url
+}
+
func main() {
configFile := flag.String("config", "config.yaml", "Path to the configuration file")
url := flag.String("url", "", "URL of the website to add")
diff --git a/cmd/removeSource/main.go b/cmd/removeSource/main.go
index a83b5179..7b975849 100644
--- a/cmd/removeSource/main.go
+++ b/cmd/removeSource/main.go
@@ -23,141 +23,83 @@ import (
"log"
cfg "github.com/pzaino/thecrowler/pkg/config"
-
- "github.com/lib/pq"
)
var (
config cfg.Config
)
-// removeSite removes a site from the database along with its associated entries in other tables.
-// It takes a *sql.DB as the database connection and a siteURL string as the URL of the site to be removed.
-// It starts a transaction, deletes the site from the Sources table, and then deletes the associated entries
-// in the SearchIndex, MetaTags, and KeywordIndex tables. Finally, it commits the transaction.
-// If any error occurs during the process, the transaction is rolled back and the error is returned.
-func removeSite(db *sql.DB, siteURL string) error {
- // Start a transaction
- tx, err := db.Begin()
- if err != nil {
- return err
- }
-
- // Delete from Sources
- err = deleteFromSources(tx, siteURL)
- if err != nil {
- return err
- }
-
- // Find and delete associated entries in SearchIndex, MetaTags, and KeywordIndex
- err = deleteAssociatedEntries(tx, siteURL)
- if err != nil {
- return err
- }
-
- // Commit the transaction
- err = tx.Commit()
- if err != nil {
- return err
- }
-
- return nil
+// ConsoleResponse represents the structure of the response
+// returned by the console API (addSource/removeSource, etc.).
+type ConsoleResponse struct {
+ Message string `json:"message"`
}
-func deleteFromSources(tx *sql.Tx, siteURL string) error {
- _, err := tx.Exec(`DELETE FROM Sources WHERE url = $1`, siteURL)
- if err != nil {
- rollbackTransaction(tx)
- return err
- }
- return nil
-}
+func removeSource(tx *sql.Tx, sourceURL string) (ConsoleResponse, error) {
+ var results ConsoleResponse
+ results.Message = "Failed to remove the source"
-func deleteAssociatedEntries(tx *sql.Tx, siteURL string) error {
- indexIDs, err := getAssociatedIndexIDs(tx, siteURL)
+ // First, get the source_id for the given URL to ensure it exists and to use in cascading deletes if necessary
+ var sourceID int64
+ err := tx.QueryRow("SELECT source_id FROM Sources WHERE url = $1", sourceURL).Scan(&sourceID)
if err != nil {
- rollbackTransaction(tx)
- return err
+ return results, err
}
- err = deleteFromSearchIndex(tx, siteURL)
+ // Proceed with deleting the source using the obtained source_id
+ _, err = tx.Exec("DELETE FROM Sources WHERE source_id = $1", sourceID)
if err != nil {
- rollbackTransaction(tx)
- return err
- }
-
- err = deleteFromMetaTags(tx, indexIDs)
- if err != nil {
- rollbackTransaction(tx)
- return err
+ err2 := tx.Rollback() // Rollback in case of error
+ if err2 != nil {
+ return ConsoleResponse{Message: "Failed to delete source"}, err2
+ }
+ return ConsoleResponse{Message: "Failed to delete source and related data"}, err
}
-
- err = deleteFromKeywordIndex(tx, indexIDs, siteURL)
+ _, err = tx.Exec("SELECT cleanup_orphaned_httpinfo();")
if err != nil {
- rollbackTransaction(tx)
- return err
+ err2 := tx.Rollback() // Rollback in case of error
+ if err2 != nil {
+ return ConsoleResponse{Message: "Failed to cleanup orphaned httpinfo"}, err2
+ }
+ return ConsoleResponse{Message: "Failed to cleanup orphaned httpinfo"}, err
}
-
- return nil
-}
-
-func getAssociatedIndexIDs(tx *sql.Tx, siteURL string) ([]int, error) {
- var indexIDs []int
- rows, err := tx.Query(`SELECT index_id FROM SearchIndex WHERE source_id = (SELECT source_id FROM Sources WHERE url = $1)`, siteURL)
+ _, err = tx.Exec("SELECT cleanup_orphaned_netinfo();")
if err != nil {
- return nil, err
- }
- defer rows.Close()
-
- for rows.Next() {
- var indexID int
- if err := rows.Scan(&indexID); err != nil {
- return nil, err
+ err2 := tx.Rollback() // Rollback in case of error
+ if err2 != nil {
+ return ConsoleResponse{Message: "Failed to cleanup orphaned netinfo"}, err2
}
- indexIDs = append(indexIDs, indexID)
+ return ConsoleResponse{Message: "Failed to cleanup orphaned netinfo"}, err
}
- return indexIDs, nil
+ results.Message = "Source and related data removed successfully"
+ return results, nil
}
-func deleteFromSearchIndex(tx *sql.Tx, siteURL string) error {
- _, err := tx.Exec(`DELETE FROM SearchIndex WHERE source_id = (SELECT source_id FROM Sources WHERE url = $1)`, siteURL)
+// removeSite removes a site from the database along with its associated entries in other tables.
+// It takes a *sql.DB as the database connection and a siteURL string as the URL of the site to be removed.
+// It starts a transaction and delegates to removeSource, which deletes the source row (letting cascading
+// deletes remove the associated index data) and cleans up orphaned HTTPInfo and NetInfo records.
+// Finally, it commits the transaction. If any error occurs, the transaction is rolled back and the error is returned.
+func removeSite(db *sql.DB, siteURL string) error {
+ // Start a transaction
+ tx, err := db.Begin()
if err != nil {
return err
}
- return nil
-}
-func deleteFromMetaTags(tx *sql.Tx, indexIDs []int) error {
- for _, id := range indexIDs {
- _, err := tx.Exec(`DELETE FROM MetaTags WHERE index_id = $1`, id)
- if err != nil {
- return err
- }
- }
- return nil
-}
-
-func deleteFromKeywordIndex(tx *sql.Tx, indexIDs []int, siteURL string) error {
- _, err := tx.Exec(`
- DELETE FROM KeywordIndex
- WHERE index_id = ANY($1)
- AND NOT EXISTS (
- SELECT 1 FROM SearchIndex
- WHERE index_id = KeywordIndex.index_id
- AND source_id != (SELECT source_id FROM Sources WHERE url = $2)
- )`, pq.Array(indexIDs), siteURL)
+ _, err = removeSource(tx, siteURL)
if err != nil {
return err
}
- return nil
-}
-func rollbackTransaction(tx *sql.Tx) {
- err := tx.Rollback()
+ // Commit the transaction
+ err = tx.Commit()
if err != nil {
- log.Printf("Error rolling back transaction: %v\n", err)
+ return err
}
+
+ return nil
}
func main() {
diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index d65b94bb..cbb03df0 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -606,11 +606,11 @@ func insertOrUpdateWebObjects(tx *sql.Tx, indexID int64, pageInfo PageInfo) erro
// Step 1: Insert into WebObjects
err := tx.QueryRow(`
- INSERT INTO WebObjects (object_url, object_hash, object_content)
+ INSERT INTO WebObjects (object_html, object_hash, object_content)
VALUES ($1, $2, $3)
ON CONFLICT (object_hash) DO UPDATE
SET object_content = EXCLUDED.object_content
- RETURNING object_id;`, pageInfo.URL, hash, pageInfo.BodyText).Scan(&objID)
+ RETURNING object_id;`, pageInfo.HTML, hash, pageInfo.BodyText).Scan(&objID)
if err != nil {
return err
}
@@ -714,8 +714,11 @@ func insertHTTPInfo(tx *sql.Tx, indexID int64, httpInfo *httpi.HTTPDetails) erro
// It takes a transaction, index ID, and a map of meta tags as parameters.
// Each meta tag is inserted into the MetaTags table with the corresponding index ID, name, and content.
// Returns an error if there was a problem executing the SQL statement.
-func insertMetaTags(tx *sql.Tx, indexID int64, metaTags map[string]string) error {
- for name, content := range metaTags {
+func insertMetaTags(tx *sql.Tx, indexID int64, metaTags []MetaTag) error {
+ for _, metatag := range metaTags {
+ name := metatag.Name
+ content := metatag.Content
+
var metatagID int64
// Try to find the metatag ID first
@@ -884,6 +887,10 @@ func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo {
title, _ := webPage.Title()
summary := doc.Find("meta[name=description]").AttrOr("content", "")
bodyText := doc.Find("body").Text()
+ // transform tabs into spaces
+ bodyText = strings.Replace(bodyText, "\t", " ", -1)
+ // remove excessive spaces in bodyText
+ bodyText = strings.Join(strings.Fields(bodyText), " ")
metaTags := extractMetaTags(doc)
@@ -893,6 +900,7 @@ func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo {
Title: title,
Summary: summary,
BodyText: bodyText,
+ HTML: htmlContent,
MetaTags: metaTags,
DetectedLang: detectLang(webPage),
DetectedType: inferDocumentType(currentURL),
@@ -946,14 +954,14 @@ func inferDocumentType(url string) string {
// extractMetaTags is a function that extracts meta tags from a goquery.Document.
// It iterates over each "meta" element in the document and retrieves the "name" and "content" attributes.
-// The extracted meta tags are stored in a map[string]string, where the "name" attribute is the key and the "content" attribute is the value.
-// The function returns the map of extracted meta tags.
-func extractMetaTags(doc *goquery.Document) map[string]string {
- metaTags := make(map[string]string)
+// Each extracted meta tag is stored as a MetaTag entry holding the element's "name" and "content" attributes.
+// The function returns the slice of extracted meta tags.
+func extractMetaTags(doc *goquery.Document) []MetaTag {
+ var metaTags []MetaTag
doc.Find("meta").Each(func(_ int, s *goquery.Selection) {
if name, exists := s.Attr("name"); exists {
content, _ := s.Attr("content")
- metaTags[name] = content
+ metaTags = append(metaTags, MetaTag{Name: name, Content: content})
}
})
return metaTags
@@ -984,7 +992,7 @@ func extractLinks(htmlContent string) []string {
doc.Find("a").Each(func(index int, item *goquery.Selection) {
linkTag := item
link, _ := linkTag.Attr("href")
- link = strings.TrimSpace(link)
+ link = normalizeURL(link, 0)
if link != "" && IsValidURL(link) {
links = append(links, link)
}
@@ -992,6 +1000,22 @@ func extractLinks(htmlContent string) []string {
return links
}
+// normalizeURL normalizes a URL by trimming spaces and trailing slashes; lowercasing is applied only when the corresponding flag bit is set.
+/* flags:
+ 1: Convert to lowercase
+*/
+func normalizeURL(url string, flags uint) string {
+ // Trim spaces
+ url = strings.TrimSpace(url)
+ // Trim trailing slash
+ url = strings.TrimRight(url, "/")
+ // Convert to lowercase
+ if flags&1 == 1 {
+ url = strings.ToLower(url)
+ }
+ return url
+}
+
// isExternalLink checks if the link is external (aka outside the Source domain)
// isExternalLink checks if linkURL is external to sourceURL based on domainLevel.
func isExternalLink(sourceURL, linkURL string, domainLevel int) bool {
diff --git a/pkg/crawler/keywords.go b/pkg/crawler/keywords.go
index d9382376..ca93f347 100644
--- a/pkg/crawler/keywords.go
+++ b/pkg/crawler/keywords.go
@@ -196,8 +196,8 @@ func extractKeywords(pageInfo PageInfo) []string {
content := normalizeText(doc.Text())
// Extract from meta tags (keywords and description)
- keywords = append(keywords, extractFromMetaTag(pageInfo.MetaTags, "keywords")...)
- keywords = append(keywords, extractFromMetaTag(pageInfo.MetaTags, "description")...)
+ keywords = append(keywords, extractFromMetaTag(pageInfo.Keywords, "keywords")...)
+ keywords = append(keywords, extractFromMetaTag(pageInfo.Keywords, "description")...)
// Extract from main content
contentKeywords := extractContentKeywords(content)
diff --git a/pkg/crawler/keywords_test.go b/pkg/crawler/keywords_test.go
index 8ec6d157..b3f90e4a 100644
--- a/pkg/crawler/keywords_test.go
+++ b/pkg/crawler/keywords_test.go
@@ -128,7 +128,7 @@ func TestExtractKeywords(t *testing.T) {
keywords := make(map[string]string)
keywords["keywords"] = testData
pageInfo := PageInfo{
- MetaTags: keywords,
+ Keywords: keywords,
}
tests := []struct {
diff --git a/pkg/crawler/types.go b/pkg/crawler/types.go
index 654cb9de..ad398a78 100644
--- a/pkg/crawler/types.go
+++ b/pkg/crawler/types.go
@@ -30,6 +30,12 @@ type SeleniumInstance struct {
Config cfg.Selenium
}
+// MetaTag represents a single meta tag, including its name and content.
+type MetaTag struct {
+ Name string
+ Content string
+}
+
// PageInfo represents the information of a web page.
type PageInfo struct {
URL string // The URL of the web page.
@@ -37,7 +43,9 @@ type PageInfo struct {
Title string // The title of the web page.
Summary string // A summary of the web page content.
BodyText string // The main body text of the web page.
- MetaTags map[string]string // The meta tags of the web page.
+ HTML string // The HTML content of the web page.
+ MetaTags []MetaTag // The meta tags of the web page.
+ Keywords map[string]string // The keywords of the web page.
DetectedType string // The detected document type of the web page.
DetectedLang string // The detected language of the web page.
NetInfo *neti.NetInfo // The network information of the web page.
diff --git a/pkg/database/postgresql-setup-v1.2.pgsql b/pkg/database/postgresql-setup-v1.2.pgsql
index 27ad9b32..c35ab9dd 100644
--- a/pkg/database/postgresql-setup-v1.2.pgsql
+++ b/pkg/database/postgresql-setup-v1.2.pgsql
@@ -96,11 +96,11 @@ CREATE TABLE IF NOT EXISTS WebObjects (
object_id BIGSERIAL PRIMARY KEY,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL,
last_updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- object_url TEXT NOT NULL, -- The original URL where the object was found
object_link TEXT NOT NULL DEFAULT 'db', -- The link to where the object is stored if not in the DB
object_type VARCHAR(255) NOT NULL DEFAULT 'text/html', -- The type of the object, for fast searches
object_hash VARCHAR(64) UNIQUE NOT NULL, -- SHA256 hash of the object for fast comparison and uniqueness
- object_content TEXT -- The actual content of the object, nullable if stored externally
+ object_content TEXT, -- The actual content of the object, nullable if stored externally
+ object_html TEXT -- The HTML content of the object, nullable if stored externally
);
-- MetaTags table stores the meta tags from the SearchIndex
@@ -502,17 +502,6 @@ BEGIN
END
$$;
--- Creates an index for the WebObjects object_url column
-DO $$
-BEGIN
- -- Check if the index already exists
- IF NOT EXISTS (SELECT 1 FROM pg_indexes WHERE indexname = 'idx_webobjects_object_url') THEN
- -- Create the index if it doesn't exist
- CREATE INDEX idx_webobjects_object_url ON WebObjects(object_url text_pattern_ops);
- END IF;
-END
-$$;
-
-- Creates an index for the WebObjects object_link column
DO $$
BEGIN
diff --git a/services/api/console.go b/services/api/console.go
index 96fa2934..2af42384 100644
--- a/services/api/console.go
+++ b/services/api/console.go
@@ -29,11 +29,14 @@ func performAddSource(query string, qType int) (ConsoleResponse, error) {
var sqlQuery string
var sqlParams addSourceRequest
if qType == 1 {
- sqlParams.URL = query
+ sqlParams.URL = normalizeURL(query)
sqlQuery = "INSERT INTO Sources (url, last_crawled_at, status) VALUES ($1, NULL, 'pending')"
} else {
// extract the parameters from the query
extractAddSourceParams(query, &sqlParams)
+ // Normalize the URL
+ sqlParams.URL = normalizeURL(sqlParams.URL)
+ // Prepare the SQL query
sqlQuery = "INSERT INTO Sources (url, last_crawled_at, status, restricted, disabled, flags, config) VALUES ($1, NULL, $2, $3, $4, $5, $6)"
}
@@ -189,7 +192,7 @@ func removeSource(tx *sql.Tx, sourceURL string) (ConsoleResponse, error) {
}
return ConsoleResponse{Message: "Failed to delete source and related data"}, err
}
- _, err = tx.Exec("SELECT cleanup_orphaned_httpinfo();", sourceID)
+ _, err = tx.Exec("SELECT cleanup_orphaned_httpinfo();")
if err != nil {
err2 := tx.Rollback() // Rollback in case of error
if err2 != nil {
@@ -197,7 +200,7 @@ func removeSource(tx *sql.Tx, sourceURL string) (ConsoleResponse, error) {
}
return ConsoleResponse{Message: "Failed to cleanup orphaned httpinfo"}, err
}
- _, err = tx.Exec("SELECT cleanup_orphaned_netinfo();", sourceID)
+ _, err = tx.Exec("SELECT cleanup_orphaned_netinfo();")
if err != nil {
err2 := tx.Rollback() // Rollback in case of error
if err2 != nil {
diff --git a/services/api/helpers.go b/services/api/helpers.go
index 218f552d..7aab6b62 100644
--- a/services/api/helpers.go
+++ b/services/api/helpers.go
@@ -5,6 +5,7 @@ import (
"fmt"
"io"
"net/http"
+ "strings"
cmn "github.com/pzaino/thecrowler/pkg/common"
)
@@ -49,3 +50,14 @@ func getQType(expr bool) int {
}
return 0
}
+
+// normalizeURL normalizes a URL by trimming spaces and trailing slashes and converting it to lowercase.
+func normalizeURL(url string) string {
+ // Trim spaces
+ url = strings.TrimSpace(url)
+ // Trim trailing slash
+ url = strings.TrimRight(url, "/")
+ // Convert to lowercase
+ url = strings.ToLower(url)
+ return url
+}