From ec3dac6dbb65f2785b43b356a197eb006a28c082 Mon Sep 17 00:00:00 2001
From: Paolo Fabio Zaino
Date: Fri, 1 Mar 2024 19:10:25 +0000
Subject: [PATCH] Improved quality of meta tag extraction and fixed a bug in the
removeSource command
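
Meta tags are now extracted into an ordered []MetaTag slice instead of a
map[string]string, so duplicate names and document order are preserved, and
the extracted body text is cleaned up (tabs turned into spaces, runs of
whitespace collapsed) before indexing. WebObjects gains an object_html
column that stores the page HTML in place of object_url, whose index is
dropped from the schema. The removeSource console handler now resolves the
source_id first, deletes by source_id, and calls cleanup_orphaned_httpinfo()
and cleanup_orphaned_netinfo() without the extra sourceID argument that was
previously passed to them; the standalone removeSource command is reworked
to use the same deletion logic. URLs are normalized (whitespace and trailing
slashes trimmed, lowercased) before being inserted by addSource and the
console API. A short usage sketch of the reworked helpers follows the
diffstat below.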
---
cmd/addSource/main.go | 15 +++
cmd/removeSource/main.go | 148 +++++++----------------
pkg/crawler/crawler.go | 44 +++++--
pkg/crawler/keywords.go | 4 +-
pkg/crawler/keywords_test.go | 2 +-
pkg/crawler/types.go | 10 +-
pkg/database/postgresql-setup-v1.2.pgsql | 15 +--
services/api/console.go | 9 +-
services/api/helpers.go | 12 ++
9 files changed, 126 insertions(+), 133 deletions(-)
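
For review purposes, a minimal, self-contained sketch of how the reworked
helpers behave. The MetaTag type and the extractMetaTags and normalizeURL
bodies are taken from this patch; the sample HTML, package wiring and main
function are illustrative assumptions only, not part of the change.

package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// MetaTag mirrors the new pkg/crawler type: one entry per <meta> element, so
// duplicate names and document order are preserved (a map[string]string kept
// only the last occurrence of each name).
type MetaTag struct {
	Name    string
	Content string
}

// extractMetaTags mirrors the reworked extraction in pkg/crawler/crawler.go.
func extractMetaTags(doc *goquery.Document) []MetaTag {
	var metaTags []MetaTag
	doc.Find("meta").Each(func(_ int, s *goquery.Selection) {
		if name, exists := s.Attr("name"); exists {
			content, _ := s.Attr("content")
			metaTags = append(metaTags, MetaTag{Name: name, Content: content})
		}
	})
	return metaTags
}

// normalizeURL mirrors the crawler helper: spaces and trailing slashes are
// always trimmed; bit 1 of flags additionally lowercases the URL.
func normalizeURL(url string, flags uint) string {
	url = strings.TrimSpace(url)
	url = strings.TrimRight(url, "/")
	if flags&1 == 1 {
		url = strings.ToLower(url)
	}
	return url
}

func main() {
	// Illustrative page only; any HTML with <meta name=...> tags works.
	page := `<html><head>
	<meta name="description" content="Example page">
	<meta name="keywords" content="crawler, indexing">
	</head><body></body></html>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(page))
	if err != nil {
		panic(err)
	}
	for _, mt := range extractMetaTags(doc) {
		fmt.Printf("%s => %s\n", mt.Name, mt.Content)
	}

	fmt.Println(normalizeURL("  https://Example.COM/Path/ ", 0)) // case preserved
	fmt.Println(normalizeURL("  https://Example.COM/Path/ ", 1)) // lowercased
}
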
diff --git a/cmd/addSource/main.go b/cmd/addSource/main.go
index c07fa906..f4c3c7e8 100644
--- a/cmd/addSource/main.go
+++ b/cmd/addSource/main.go
@@ -21,6 +21,7 @@ import (
"flag"
"fmt"
"log"
+ "strings"
cfg "github.com/pzaino/thecrowler/pkg/config"
@@ -35,6 +36,9 @@ func insertWebsite(db *sql.DB, url string) error {
// SQL statement to insert a new website
stmt := `INSERT INTO Sources (url, last_crawled_at, status) VALUES ($1, NULL, 'pending')`
+ // Normalize the URL
+ url = normalizeURL(url)
+
// Execute the SQL statement
_, err := db.Exec(stmt, url)
if err != nil {
@@ -45,6 +49,17 @@ func insertWebsite(db *sql.DB, url string) error {
return nil
}
+// normalizeURL normalizes a URL by trimming spaces and trailing slashes and converting it to lowercase.
+func normalizeURL(url string) string {
+ // Trim spaces
+ url = strings.TrimSpace(url)
+ // Trim trailing slash
+ url = strings.TrimRight(url, "/")
+ // Convert to lowercase
+ url = strings.ToLower(url)
+ return url
+}
+
func main() {
configFile := flag.String("config", "config.yaml", "Path to the configuration file")
url := flag.String("url", "", "URL of the website to add")
diff --git a/cmd/removeSource/main.go b/cmd/removeSource/main.go
index a83b5179..7b975849 100644
--- a/cmd/removeSource/main.go
+++ b/cmd/removeSource/main.go
@@ -23,141 +23,83 @@ import (
"log"
cfg "github.com/pzaino/thecrowler/pkg/config"
-
- "github.com/lib/pq"
)
var (
config cfg.Config
)
-// removeSite removes a site from the database along with its associated entries in other tables.
-// It takes a *sql.DB as the database connection and a siteURL string as the URL of the site to be removed.
-// It starts a transaction, deletes the site from the Sources table, and then deletes the associated entries
-// in the SearchIndex, MetaTags, and KeywordIndex tables. Finally, it commits the transaction.
-// If any error occurs during the process, the transaction is rolled back and the error is returned.
-func removeSite(db *sql.DB, siteURL string) error {
- // Start a transaction
- tx, err := db.Begin()
- if err != nil {
- return err
- }
-
- // Delete from Sources
- err = deleteFromSources(tx, siteURL)
- if err != nil {
- return err
- }
-
- // Find and delete associated entries in SearchIndex, MetaTags, and KeywordIndex
- err = deleteAssociatedEntries(tx, siteURL)
- if err != nil {
- return err
- }
-
- // Commit the transaction
- err = tx.Commit()
- if err != nil {
- return err
- }
-
- return nil
+// ConsoleResponse represents the structure of the response
+// returned by the console API (addSource/removeSource, etc.).
+type ConsoleResponse struct {
+ Message string `json:"message"`
}
-func deleteFromSources(tx *sql.Tx, siteURL string) error {
- _, err := tx.Exec(`DELETE FROM Sources WHERE url = $1`, siteURL)
- if err != nil {
- rollbackTransaction(tx)
- return err
- }
- return nil
-}
+func removeSource(tx *sql.Tx, sourceURL string) (ConsoleResponse, error) {
+ var results ConsoleResponse
+ results.Message = "Failed to remove the source"
-func deleteAssociatedEntries(tx *sql.Tx, siteURL string) error {
- indexIDs, err := getAssociatedIndexIDs(tx, siteURL)
+ // First, get the source_id for the given URL to ensure it exists and to use in cascading deletes if necessary
+ var sourceID int64
+ err := tx.QueryRow("SELECT source_id FROM Sources WHERE url = $1", sourceURL).Scan(&sourceID)
if err != nil {
- rollbackTransaction(tx)
- return err
+ return results, err
}
- err = deleteFromSearchIndex(tx, siteURL)
+ // Proceed with deleting the source using the obtained source_id
+ _, err = tx.Exec("DELETE FROM Sources WHERE source_id = $1", sourceID)
if err != nil {
- rollbackTransaction(tx)
- return err
- }
-
- err = deleteFromMetaTags(tx, indexIDs)
- if err != nil {
- rollbackTransaction(tx)
- return err
+ err2 := tx.Rollback() // Rollback in case of error
+ if err2 != nil {
+ return ConsoleResponse{Message: "Failed to delete source"}, err2
+ }
+ return ConsoleResponse{Message: "Failed to delete source and related data"}, err
}
-
- err = deleteFromKeywordIndex(tx, indexIDs, siteURL)
+ _, err = tx.Exec("SELECT cleanup_orphaned_httpinfo();")
if err != nil {
- rollbackTransaction(tx)
- return err
+ err2 := tx.Rollback() // Rollback in case of error
+ if err2 != nil {
+ return ConsoleResponse{Message: "Failed to cleanup orphaned httpinfo"}, err2
+ }
+ return ConsoleResponse{Message: "Failed to cleanup orphaned httpinfo"}, err
}
-
- return nil
-}
-
-func getAssociatedIndexIDs(tx *sql.Tx, siteURL string) ([]int, error) {
- var indexIDs []int
- rows, err := tx.Query(`SELECT index_id FROM SearchIndex WHERE source_id = (SELECT source_id FROM Sources WHERE url = $1)`, siteURL)
+ _, err = tx.Exec("SELECT cleanup_orphaned_netinfo();")
if err != nil {
- return nil, err
- }
- defer rows.Close()
-
- for rows.Next() {
- var indexID int
- if err := rows.Scan(&indexID); err != nil {
- return nil, err
+ err2 := tx.Rollback() // Rollback in case of error
+ if err2 != nil {
+ return ConsoleResponse{Message: "Failed to cleanup orphaned netinfo"}, err2
}
- indexIDs = append(indexIDs, indexID)
+ return ConsoleResponse{Message: "Failed to cleanup orphaned netinfo"}, err
}
- return indexIDs, nil
+ results.Message = "Source and related data removed successfully"
+ return results, nil
}
-func deleteFromSearchIndex(tx *sql.Tx, siteURL string) error {
- _, err := tx.Exec(`DELETE FROM SearchIndex WHERE source_id = (SELECT source_id FROM Sources WHERE url = $1)`, siteURL)
+// removeSite removes a site from the database along with its associated entries in other tables.
+// It takes a *sql.DB as the database connection and a siteURL string as the URL of the site to be removed.
+// It starts a transaction and delegates to removeSource, which deletes the source row (letting cascading
+// deletes remove the associated index data) and cleans up orphaned HTTPInfo and NetInfo records.
+// Finally, it commits the transaction. If any error occurs, the transaction is rolled back and the error is returned.
+func removeSite(db *sql.DB, siteURL string) error {
+ // Start a transaction
+ tx, err := db.Begin()
if err != nil {
return err
}
- return nil
-}
-func deleteFromMetaTags(tx *sql.Tx, indexIDs []int) error {
- for _, id := range indexIDs {
- _, err := tx.Exec(`DELETE FROM MetaTags WHERE index_id = $1`, id)
- if err != nil {
- return err
- }
- }
- return nil
-}
-
-func deleteFromKeywordIndex(tx *sql.Tx, indexIDs []int, siteURL string) error {
- _, err := tx.Exec(`
- DELETE FROM KeywordIndex
- WHERE index_id = ANY($1)
- AND NOT EXISTS (
- SELECT 1 FROM SearchIndex
- WHERE index_id = KeywordIndex.index_id
- AND source_id != (SELECT source_id FROM Sources WHERE url = $2)
- )`, pq.Array(indexIDs), siteURL)
+ _, err = removeSource(tx, siteURL)
if err != nil {
return err
}
- return nil
-}
-func rollbackTransaction(tx *sql.Tx) {
- err := tx.Rollback()
+ // Commit the transaction
+ err = tx.Commit()
if err != nil {
- log.Printf("Error rolling back transaction: %v\n", err)
+ return err
}
+
+ return nil
}
func main() {
diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index d65b94bb..cbb03df0 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -606,11 +606,11 @@ func insertOrUpdateWebObjects(tx *sql.Tx, indexID int64, pageInfo PageInfo) erro
// Step 1: Insert into WebObjects
err := tx.QueryRow(`
- INSERT INTO WebObjects (object_url, object_hash, object_content)
+ INSERT INTO WebObjects (object_html, object_hash, object_content)
VALUES ($1, $2, $3)
ON CONFLICT (object_hash) DO UPDATE
SET object_content = EXCLUDED.object_content
- RETURNING object_id;`, pageInfo.URL, hash, pageInfo.BodyText).Scan(&objID)
+ RETURNING object_id;`, pageInfo.HTML, hash, pageInfo.BodyText).Scan(&objID)
if err != nil {
return err
}
@@ -714,8 +714,11 @@ func insertHTTPInfo(tx *sql.Tx, indexID int64, httpInfo *httpi.HTTPDetails) erro
// It takes a transaction, index ID, and a map of meta tags as parameters.
// Each meta tag is inserted into the MetaTags table with the corresponding index ID, name, and content.
// Returns an error if there was a problem executing the SQL statement.
-func insertMetaTags(tx *sql.Tx, indexID int64, metaTags map[string]string) error {
- for name, content := range metaTags {
+func insertMetaTags(tx *sql.Tx, indexID int64, metaTags []MetaTag) error {
+ for _, metatag := range metaTags {
+ name := metatag.Name
+ content := metatag.Content
+
var metatagID int64
// Try to find the metatag ID first
@@ -884,6 +887,10 @@ func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo {
title, _ := webPage.Title()
summary := doc.Find("meta[name=description]").AttrOr("content", "")
bodyText := doc.Find("body").Text()
+ // transform tabs into spaces
+ bodyText = strings.Replace(bodyText, "\t", " ", -1)
+ // remove excessive spaces in bodyText
+ bodyText = strings.Join(strings.Fields(bodyText), " ")
metaTags := extractMetaTags(doc)
@@ -893,6 +900,7 @@ func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo {
Title: title,
Summary: summary,
BodyText: bodyText,
+ HTML: htmlContent,
MetaTags: metaTags,
DetectedLang: detectLang(webPage),
DetectedType: inferDocumentType(currentURL),
@@ -946,14 +954,14 @@ func inferDocumentType(url string) string {
// extractMetaTags is a function that extracts meta tags from a goquery.Document.
// It iterates over each "meta" element in the document and retrieves the "name" and "content" attributes.
-// The extracted meta tags are stored in a map[string]string, where the "name" attribute is the key and the "content" attribute is the value.
-// The function returns the map of extracted meta tags.
-func extractMetaTags(doc *goquery.Document) map[string]string {
- metaTags := make(map[string]string)
+// Each extracted meta tag is stored as a MetaTag entry holding the element's "name" and "content" attributes.
+// The function returns the slice of extracted meta tags.
+func extractMetaTags(doc *goquery.Document) []MetaTag {
+ var metaTags []MetaTag
doc.Find("meta").Each(func(_ int, s *goquery.Selection) {
if name, exists := s.Attr("name"); exists {
content, _ := s.Attr("content")
- metaTags[name] = content
+ metaTags = append(metaTags, MetaTag{Name: name, Content: content})
}
})
return metaTags
@@ -984,7 +992,7 @@ func extractLinks(htmlContent string) []string {
doc.Find("a").Each(func(index int, item *goquery.Selection) {
linkTag := item
link, _ := linkTag.Attr("href")
- link = strings.TrimSpace(link)
+ link = normalizeURL(link, 0)
if link != "" && IsValidURL(link) {
links = append(links, link)
}
@@ -992,6 +1000,22 @@ func extractLinks(htmlContent string) []string {
return links
}
+// normalizeURL normalizes a URL by trimming spaces and trailing slashes; lowercasing is applied only when the corresponding flag bit is set.
+/* flags:
+ 1: Convert to lowercase
+*/
+func normalizeURL(url string, flags uint) string {
+ // Trim spaces
+ url = strings.TrimSpace(url)
+ // Trim trailing slash
+ url = strings.TrimRight(url, "/")
+ // Convert to lowercase
+ if flags&1 == 1 {
+ url = strings.ToLower(url)
+ }
+ return url
+}
+
// isExternalLink checks if the link is external (aka outside the Source domain)
// isExternalLink checks if linkURL is external to sourceURL based on domainLevel.
func isExternalLink(sourceURL, linkURL string, domainLevel int) bool {
diff --git a/pkg/crawler/keywords.go b/pkg/crawler/keywords.go
index d9382376..ca93f347 100644
--- a/pkg/crawler/keywords.go
+++ b/pkg/crawler/keywords.go
@@ -196,8 +196,8 @@ func extractKeywords(pageInfo PageInfo) []string {
content := normalizeText(doc.Text())
// Extract from meta tags (keywords and description)
- keywords = append(keywords, extractFromMetaTag(pageInfo.MetaTags, "keywords")...)
- keywords = append(keywords, extractFromMetaTag(pageInfo.MetaTags, "description")...)
+ keywords = append(keywords, extractFromMetaTag(pageInfo.Keywords, "keywords")...)
+ keywords = append(keywords, extractFromMetaTag(pageInfo.Keywords, "description")...)
// Extract from main content
contentKeywords := extractContentKeywords(content)
diff --git a/pkg/crawler/keywords_test.go b/pkg/crawler/keywords_test.go
index 8ec6d157..b3f90e4a 100644
--- a/pkg/crawler/keywords_test.go
+++ b/pkg/crawler/keywords_test.go
@@ -128,7 +128,7 @@ func TestExtractKeywords(t *testing.T) {
keywords := make(map[string]string)
keywords["keywords"] = testData
pageInfo := PageInfo{
- MetaTags: keywords,
+ Keywords: keywords,
}
tests := []struct {
diff --git a/pkg/crawler/types.go b/pkg/crawler/types.go
index 654cb9de..ad398a78 100644
--- a/pkg/crawler/types.go
+++ b/pkg/crawler/types.go
@@ -30,6 +30,12 @@ type SeleniumInstance struct {
Config cfg.Selenium
}
+// MetaTag represents a single meta tag, including its name and content.
+type MetaTag struct {
+ Name string
+ Content string
+}
+
// PageInfo represents the information of a web page.
type PageInfo struct {
URL string // The URL of the web page.
@@ -37,7 +43,9 @@ type PageInfo struct {
Title string // The title of the web page.
Summary string // A summary of the web page content.
BodyText string // The main body text of the web page.
- MetaTags map[string]string // The meta tags of the web page.
+ HTML string // The HTML content of the web page.
+ MetaTags []MetaTag // The meta tags of the web page.
+ Keywords map[string]string // The keywords of the web page.
DetectedType string // The detected document type of the web page.
DetectedLang string // The detected language of the web page.
NetInfo *neti.NetInfo // The network information of the web page.
diff --git a/pkg/database/postgresql-setup-v1.2.pgsql b/pkg/database/postgresql-setup-v1.2.pgsql
index 27ad9b32..c35ab9dd 100644
--- a/pkg/database/postgresql-setup-v1.2.pgsql
+++ b/pkg/database/postgresql-setup-v1.2.pgsql
@@ -96,11 +96,11 @@ CREATE TABLE IF NOT EXISTS WebObjects (
object_id BIGSERIAL PRIMARY KEY,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL,
last_updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
- object_url TEXT NOT NULL, -- The original URL where the object was found
object_link TEXT NOT NULL DEFAULT 'db', -- The link to where the object is stored if not in the DB
object_type VARCHAR(255) NOT NULL DEFAULT 'text/html', -- The type of the object, for fast searches
object_hash VARCHAR(64) UNIQUE NOT NULL, -- SHA256 hash of the object for fast comparison and uniqueness
- object_content TEXT -- The actual content of the object, nullable if stored externally
+ object_content TEXT, -- The actual content of the object, nullable if stored externally
+ object_html TEXT -- The HTML content of the object, nullable if stored externally
);
-- MetaTags table stores the meta tags from the SearchIndex
@@ -502,17 +502,6 @@ BEGIN
END
$$;
--- Creates an index for the WebObjects object_url column
-DO $$
-BEGIN
- -- Check if the index already exists
- IF NOT EXISTS (SELECT 1 FROM pg_indexes WHERE indexname = 'idx_webobjects_object_url') THEN
- -- Create the index if it doesn't exist
- CREATE INDEX idx_webobjects_object_url ON WebObjects(object_url text_pattern_ops);
- END IF;
-END
-$$;
-
-- Creates an index for the WebObjects object_link column
DO $$
BEGIN
diff --git a/services/api/console.go b/services/api/console.go
index 96fa2934..2af42384 100644
--- a/services/api/console.go
+++ b/services/api/console.go
@@ -29,11 +29,14 @@ func performAddSource(query string, qType int) (ConsoleResponse, error) {
var sqlQuery string
var sqlParams addSourceRequest
if qType == 1 {
- sqlParams.URL = query
+ sqlParams.URL = normalizeURL(query)
sqlQuery = "INSERT INTO Sources (url, last_crawled_at, status) VALUES ($1, NULL, 'pending')"
} else {
// extract the parameters from the query
extractAddSourceParams(query, &sqlParams)
+ // Normalize the URL
+ sqlParams.URL = normalizeURL(sqlParams.URL)
+ // Prepare the SQL query
sqlQuery = "INSERT INTO Sources (url, last_crawled_at, status, restricted, disabled, flags, config) VALUES ($1, NULL, $2, $3, $4, $5, $6)"
}
@@ -189,7 +192,7 @@ func removeSource(tx *sql.Tx, sourceURL string) (ConsoleResponse, error) {
}
return ConsoleResponse{Message: "Failed to delete source and related data"}, err
}
- _, err = tx.Exec("SELECT cleanup_orphaned_httpinfo();", sourceID)
+ _, err = tx.Exec("SELECT cleanup_orphaned_httpinfo();")
if err != nil {
err2 := tx.Rollback() // Rollback in case of error
if err2 != nil {
@@ -197,7 +200,7 @@ func removeSource(tx *sql.Tx, sourceURL string) (ConsoleResponse, error) {
}
return ConsoleResponse{Message: "Failed to cleanup orphaned httpinfo"}, err
}
- _, err = tx.Exec("SELECT cleanup_orphaned_netinfo();", sourceID)
+ _, err = tx.Exec("SELECT cleanup_orphaned_netinfo();")
if err != nil {
err2 := tx.Rollback() // Rollback in case of error
if err2 != nil {
diff --git a/services/api/helpers.go b/services/api/helpers.go
index 218f552d..7aab6b62 100644
--- a/services/api/helpers.go
+++ b/services/api/helpers.go
@@ -5,6 +5,7 @@ import (
"fmt"
"io"
"net/http"
+ "strings"
cmn "github.com/pzaino/thecrowler/pkg/common"
)
@@ -49,3 +50,14 @@ func getQType(expr bool) int {
}
return 0
}
+
+// normalizeURL normalizes a URL by trimming spaces and trailing slashes and converting it to lowercase.
+func normalizeURL(url string) string {
+ // Trim spaces
+ url = strings.TrimSpace(url)
+ // Trim trailing slash
+ url = strings.TrimRight(url, "/")
+ // Convert to lowercase
+ url = strings.ToLower(url)
+ return url
+}