From ec3dac6dbb65f2785b43b356a197eb006a28c082 Mon Sep 17 00:00:00 2001 From: Paolo Fabio Zaino Date: Fri, 1 Mar 2024 19:10:25 +0000 Subject: [PATCH] Improved quality for meta tags extraction and fixed a bug in removeSource command --- cmd/addSource/main.go | 15 +++ cmd/removeSource/main.go | 148 +++++++---------------- pkg/crawler/crawler.go | 44 +++++-- pkg/crawler/keywords.go | 4 +- pkg/crawler/keywords_test.go | 2 +- pkg/crawler/types.go | 10 +- pkg/database/postgresql-setup-v1.2.pgsql | 15 +-- services/api/console.go | 9 +- services/api/helpers.go | 12 ++ 9 files changed, 126 insertions(+), 133 deletions(-) diff --git a/cmd/addSource/main.go b/cmd/addSource/main.go index c07fa906..f4c3c7e8 100644 --- a/cmd/addSource/main.go +++ b/cmd/addSource/main.go @@ -21,6 +21,7 @@ import ( "flag" "fmt" "log" + "strings" cfg "github.com/pzaino/thecrowler/pkg/config" @@ -35,6 +36,9 @@ func insertWebsite(db *sql.DB, url string) error { // SQL statement to insert a new website stmt := `INSERT INTO Sources (url, last_crawled_at, status) VALUES ($1, NULL, 'pending')` + // Normalize the URL + url = normalizeURL(url) + // Execute the SQL statement _, err := db.Exec(stmt, url) if err != nil { @@ -45,6 +49,17 @@ func insertWebsite(db *sql.DB, url string) error { return nil } +// normalizeURL normalizes a URL by trimming trailing slashes and converting it to lowercase. +func normalizeURL(url string) string { + // Trim spaces + url = strings.TrimSpace(url) + // Trim trailing slash + url = strings.TrimRight(url, "/") + // Convert to lowercase + url = strings.ToLower(url) + return url +} + func main() { configFile := flag.String("config", "config.yaml", "Path to the configuration file") url := flag.String("url", "", "URL of the website to add") diff --git a/cmd/removeSource/main.go b/cmd/removeSource/main.go index a83b5179..7b975849 100644 --- a/cmd/removeSource/main.go +++ b/cmd/removeSource/main.go @@ -23,141 +23,83 @@ import ( "log" cfg "github.com/pzaino/thecrowler/pkg/config" - - "github.com/lib/pq" ) var ( config cfg.Config ) -// removeSite removes a site from the database along with its associated entries in other tables. -// It takes a *sql.DB as the database connection and a siteURL string as the URL of the site to be removed. -// It starts a transaction, deletes the site from the Sources table, and then deletes the associated entries -// in the SearchIndex, MetaTags, and KeywordIndex tables. Finally, it commits the transaction. -// If any error occurs during the process, the transaction is rolled back and the error is returned. -func removeSite(db *sql.DB, siteURL string) error { - // Start a transaction - tx, err := db.Begin() - if err != nil { - return err - } - - // Delete from Sources - err = deleteFromSources(tx, siteURL) - if err != nil { - return err - } - - // Find and delete associated entries in SearchIndex, MetaTags, and KeywordIndex - err = deleteAssociatedEntries(tx, siteURL) - if err != nil { - return err - } - - // Commit the transaction - err = tx.Commit() - if err != nil { - return err - } - - return nil +// ConsoleResponse represents the structure of the response +// returned by the console API (addSource/removeSOurce etc.). 
+type ConsoleResponse struct { + Message string `json:"message"` } -func deleteFromSources(tx *sql.Tx, siteURL string) error { - _, err := tx.Exec(`DELETE FROM Sources WHERE url = $1`, siteURL) - if err != nil { - rollbackTransaction(tx) - return err - } - return nil -} +func removeSource(tx *sql.Tx, sourceURL string) (ConsoleResponse, error) { + var results ConsoleResponse + results.Message = "Failed to remove the source" -func deleteAssociatedEntries(tx *sql.Tx, siteURL string) error { - indexIDs, err := getAssociatedIndexIDs(tx, siteURL) + // First, get the source_id for the given URL to ensure it exists and to use in cascading deletes if necessary + var sourceID int64 + err := tx.QueryRow("SELECT source_id FROM Sources WHERE url = $1", sourceURL).Scan(&sourceID) if err != nil { - rollbackTransaction(tx) - return err + return results, err } - err = deleteFromSearchIndex(tx, siteURL) + // Proceed with deleting the source using the obtained source_id + _, err = tx.Exec("DELETE FROM Sources WHERE source_id = $1", sourceID) if err != nil { - rollbackTransaction(tx) - return err - } - - err = deleteFromMetaTags(tx, indexIDs) - if err != nil { - rollbackTransaction(tx) - return err + err2 := tx.Rollback() // Rollback in case of error + if err2 != nil { + return ConsoleResponse{Message: "Failed to delete source"}, err2 + } + return ConsoleResponse{Message: "Failed to delete source and related data"}, err } - - err = deleteFromKeywordIndex(tx, indexIDs, siteURL) + _, err = tx.Exec("SELECT cleanup_orphaned_httpinfo();") if err != nil { - rollbackTransaction(tx) - return err + err2 := tx.Rollback() // Rollback in case of error + if err2 != nil { + return ConsoleResponse{Message: "Failed to cleanup orphaned httpinfo"}, err2 + } + return ConsoleResponse{Message: "Failed to cleanup orphaned httpinfo"}, err } - - return nil -} - -func getAssociatedIndexIDs(tx *sql.Tx, siteURL string) ([]int, error) { - var indexIDs []int - rows, err := tx.Query(`SELECT index_id FROM SearchIndex WHERE source_id = (SELECT source_id FROM Sources WHERE url = $1)`, siteURL) + _, err = tx.Exec("SELECT cleanup_orphaned_netinfo();") if err != nil { - return nil, err - } - defer rows.Close() - - for rows.Next() { - var indexID int - if err := rows.Scan(&indexID); err != nil { - return nil, err + err2 := tx.Rollback() // Rollback in case of error + if err2 != nil { + return ConsoleResponse{Message: "Failed to cleanup orphaned netinfo"}, err2 } - indexIDs = append(indexIDs, indexID) + return ConsoleResponse{Message: "Failed to cleanup orphaned netinfo"}, err } - return indexIDs, nil + results.Message = "Source and related data removed successfully" + return results, nil } -func deleteFromSearchIndex(tx *sql.Tx, siteURL string) error { - _, err := tx.Exec(`DELETE FROM SearchIndex WHERE source_id = (SELECT source_id FROM Sources WHERE url = $1)`, siteURL) +// removeSite removes a site from the database along with its associated entries in other tables. +// It takes a *sql.DB as the database connection and a siteURL string as the URL of the site to be removed. +// It starts a transaction, deletes the site from the Sources table, and then deletes the associated entries +// in the SearchIndex, MetaTags, and KeywordIndex tables. Finally, it commits the transaction. +// If any error occurs during the process, the transaction is rolled back and the error is returned. 
+func removeSite(db *sql.DB, siteURL string) error { + // Start a transaction + tx, err := db.Begin() if err != nil { return err } - return nil -} -func deleteFromMetaTags(tx *sql.Tx, indexIDs []int) error { - for _, id := range indexIDs { - _, err := tx.Exec(`DELETE FROM MetaTags WHERE index_id = $1`, id) - if err != nil { - return err - } - } - return nil -} - -func deleteFromKeywordIndex(tx *sql.Tx, indexIDs []int, siteURL string) error { - _, err := tx.Exec(` - DELETE FROM KeywordIndex - WHERE index_id = ANY($1) - AND NOT EXISTS ( - SELECT 1 FROM SearchIndex - WHERE index_id = KeywordIndex.index_id - AND source_id != (SELECT source_id FROM Sources WHERE url = $2) - )`, pq.Array(indexIDs), siteURL) + _, err = removeSource(tx, siteURL) if err != nil { return err } - return nil -} -func rollbackTransaction(tx *sql.Tx) { - err := tx.Rollback() + // Commit the transaction + err = tx.Commit() if err != nil { - log.Printf("Error rolling back transaction: %v\n", err) + return err } + + return nil } func main() { diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go index d65b94bb..cbb03df0 100644 --- a/pkg/crawler/crawler.go +++ b/pkg/crawler/crawler.go @@ -606,11 +606,11 @@ func insertOrUpdateWebObjects(tx *sql.Tx, indexID int64, pageInfo PageInfo) erro // Step 1: Insert into WebObjects err := tx.QueryRow(` - INSERT INTO WebObjects (object_url, object_hash, object_content) + INSERT INTO WebObjects (object_html, object_hash, object_content) VALUES ($1, $2, $3) ON CONFLICT (object_hash) DO UPDATE SET object_content = EXCLUDED.object_content - RETURNING object_id;`, pageInfo.URL, hash, pageInfo.BodyText).Scan(&objID) + RETURNING object_id;`, pageInfo.HTML, hash, pageInfo.BodyText).Scan(&objID) if err != nil { return err } @@ -714,8 +714,11 @@ func insertHTTPInfo(tx *sql.Tx, indexID int64, httpInfo *httpi.HTTPDetails) erro // It takes a transaction, index ID, and a map of meta tags as parameters. // Each meta tag is inserted into the MetaTags table with the corresponding index ID, name, and content. // Returns an error if there was a problem executing the SQL statement. -func insertMetaTags(tx *sql.Tx, indexID int64, metaTags map[string]string) error { - for name, content := range metaTags { +func insertMetaTags(tx *sql.Tx, indexID int64, metaTags []MetaTag) error { + for _, metatag := range metaTags { + name := metatag.Name + content := metatag.Content + var metatagID int64 // Try to find the metatag ID first @@ -884,6 +887,10 @@ func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo { title, _ := webPage.Title() summary := doc.Find("meta[name=description]").AttrOr("content", "") bodyText := doc.Find("body").Text() + // transform tabs into spaces + bodyText = strings.Replace(bodyText, "\t", " ", -1) + // remove excessive spaces in bodyText + bodyText = strings.Join(strings.Fields(bodyText), " ") metaTags := extractMetaTags(doc) @@ -893,6 +900,7 @@ func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo { Title: title, Summary: summary, BodyText: bodyText, + HTML: htmlContent, MetaTags: metaTags, DetectedLang: detectLang(webPage), DetectedType: inferDocumentType(currentURL), @@ -946,14 +954,14 @@ func inferDocumentType(url string) string { // extractMetaTags is a function that extracts meta tags from a goquery.Document. // It iterates over each "meta" element in the document and retrieves the "name" and "content" attributes. 
-// The extracted meta tags are stored in a map[string]string, where the "name" attribute is the key and the "content" attribute is the value. -// The function returns the map of extracted meta tags. -func extractMetaTags(doc *goquery.Document) map[string]string { - metaTags := make(map[string]string) +// The extracted meta tags are stored in a []MetaTag, where the "name" attribute is the key and the "content" attribute is the value. +// The function returns the slice of extracted meta tags. +func extractMetaTags(doc *goquery.Document) []MetaTag { + var metaTags []MetaTag doc.Find("meta").Each(func(_ int, s *goquery.Selection) { if name, exists := s.Attr("name"); exists { content, _ := s.Attr("content") - metaTags[name] = content + metaTags = append(metaTags, MetaTag{Name: name, Content: content}) } }) return metaTags @@ -984,7 +992,7 @@ func extractLinks(htmlContent string) []string { doc.Find("a").Each(func(index int, item *goquery.Selection) { linkTag := item link, _ := linkTag.Attr("href") - link = strings.TrimSpace(link) + link = normalizeURL(link, 0) if link != "" && IsValidURL(link) { links = append(links, link) } @@ -992,6 +1000,22 @@ func extractLinks(htmlContent string) []string { return links } +// normalizeURL normalizes a URL by trimming trailing slashes and converting it to lowercase. +/* flags: + 1: Convert to lowercase +*/ +func normalizeURL(url string, flags uint) string { + // Trim spaces + url = strings.TrimSpace(url) + // Trim trailing slash + url = strings.TrimRight(url, "/") + // Convert to lowercase + if flags&1 == 1 { + url = strings.ToLower(url) + } + return url +} + // isExternalLink checks if the link is external (aka outside the Source domain) // isExternalLink checks if linkURL is external to sourceURL based on domainLevel. func isExternalLink(sourceURL, linkURL string, domainLevel int) bool { diff --git a/pkg/crawler/keywords.go b/pkg/crawler/keywords.go index d9382376..ca93f347 100644 --- a/pkg/crawler/keywords.go +++ b/pkg/crawler/keywords.go @@ -196,8 +196,8 @@ func extractKeywords(pageInfo PageInfo) []string { content := normalizeText(doc.Text()) // Extract from meta tags (keywords and description) - keywords = append(keywords, extractFromMetaTag(pageInfo.MetaTags, "keywords")...) - keywords = append(keywords, extractFromMetaTag(pageInfo.MetaTags, "description")...) + keywords = append(keywords, extractFromMetaTag(pageInfo.Keywords, "keywords")...) + keywords = append(keywords, extractFromMetaTag(pageInfo.Keywords, "description")...) // Extract from main content contentKeywords := extractContentKeywords(content) diff --git a/pkg/crawler/keywords_test.go b/pkg/crawler/keywords_test.go index 8ec6d157..b3f90e4a 100644 --- a/pkg/crawler/keywords_test.go +++ b/pkg/crawler/keywords_test.go @@ -128,7 +128,7 @@ func TestExtractKeywords(t *testing.T) { keywords := make(map[string]string) keywords["keywords"] = testData pageInfo := PageInfo{ - MetaTags: keywords, + Keywords: keywords, } tests := []struct { diff --git a/pkg/crawler/types.go b/pkg/crawler/types.go index 654cb9de..ad398a78 100644 --- a/pkg/crawler/types.go +++ b/pkg/crawler/types.go @@ -30,6 +30,12 @@ type SeleniumInstance struct { Config cfg.Selenium } +// MetaTag represents a single meta tag, including its name and content. +type MetaTag struct { + Name string + Content string +} + // PageInfo represents the information of a web page. type PageInfo struct { URL string // The URL of the web page. @@ -37,7 +43,9 @@ type PageInfo struct { Title string // The title of the web page. 
Summary string // A summary of the web page content. BodyText string // The main body text of the web page. - MetaTags map[string]string // The meta tags of the web page. + HTML string // The HTML content of the web page. + MetaTags []MetaTag // The meta tags of the web page. + Keywords map[string]string // The keywords of the web page. DetectedType string // The detected document type of the web page. DetectedLang string // The detected language of the web page. NetInfo *neti.NetInfo // The network information of the web page. diff --git a/pkg/database/postgresql-setup-v1.2.pgsql b/pkg/database/postgresql-setup-v1.2.pgsql index 27ad9b32..c35ab9dd 100644 --- a/pkg/database/postgresql-setup-v1.2.pgsql +++ b/pkg/database/postgresql-setup-v1.2.pgsql @@ -96,11 +96,11 @@ CREATE TABLE IF NOT EXISTS WebObjects ( object_id BIGSERIAL PRIMARY KEY, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL, last_updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, - object_url TEXT NOT NULL, -- The original URL where the object was found object_link TEXT NOT NULL DEFAULT 'db', -- The link to where the object is stored if not in the DB object_type VARCHAR(255) NOT NULL DEFAULT 'text/html', -- The type of the object, for fast searches object_hash VARCHAR(64) UNIQUE NOT NULL, -- SHA256 hash of the object for fast comparison and uniqueness - object_content TEXT -- The actual content of the object, nullable if stored externally + object_content TEXT, -- The actual content of the object, nullable if stored externally + object_html TEXT -- The HTML content of the object, nullable if stored externally ); -- MetaTags table stores the meta tags from the SearchIndex @@ -502,17 +502,6 @@ BEGIN END $$; --- Creates an index for the WebObjects object_url column -DO $$ -BEGIN - -- Check if the index already exists - IF NOT EXISTS (SELECT 1 FROM pg_indexes WHERE indexname = 'idx_webobjects_object_url') THEN - -- Create the index if it doesn't exist - CREATE INDEX idx_webobjects_object_url ON WebObjects(object_url text_pattern_ops); - END IF; -END -$$; - -- Creates an index for the WebObjects object_link column DO $$ BEGIN diff --git a/services/api/console.go b/services/api/console.go index 96fa2934..2af42384 100644 --- a/services/api/console.go +++ b/services/api/console.go @@ -29,11 +29,14 @@ func performAddSource(query string, qType int) (ConsoleResponse, error) { var sqlQuery string var sqlParams addSourceRequest if qType == 1 { - sqlParams.URL = query + sqlParams.URL = normalizeURL(query) sqlQuery = "INSERT INTO Sources (url, last_crawled_at, status) VALUES ($1, NULL, 'pending')" } else { // extract the parameters from the query extractAddSourceParams(query, &sqlParams) + // Normalize the URL + sqlParams.URL = normalizeURL(sqlParams.URL) + // Prepare the SQL query sqlQuery = "INSERT INTO Sources (url, last_crawled_at, status, restricted, disabled, flags, config) VALUES ($1, NULL, $2, $3, $4, $5, $6)" } @@ -189,7 +192,7 @@ func removeSource(tx *sql.Tx, sourceURL string) (ConsoleResponse, error) { } return ConsoleResponse{Message: "Failed to delete source and related data"}, err } - _, err = tx.Exec("SELECT cleanup_orphaned_httpinfo();", sourceID) + _, err = tx.Exec("SELECT cleanup_orphaned_httpinfo();") if err != nil { err2 := tx.Rollback() // Rollback in case of error if err2 != nil { @@ -197,7 +200,7 @@ func removeSource(tx *sql.Tx, sourceURL string) (ConsoleResponse, error) { } return ConsoleResponse{Message: "Failed to cleanup orphaned httpinfo"}, err } - _, err = tx.Exec("SELECT cleanup_orphaned_netinfo();", 
sourceID) + _, err = tx.Exec("SELECT cleanup_orphaned_netinfo();") if err != nil { err2 := tx.Rollback() // Rollback in case of error if err2 != nil { diff --git a/services/api/helpers.go b/services/api/helpers.go index 218f552d..7aab6b62 100644 --- a/services/api/helpers.go +++ b/services/api/helpers.go @@ -5,6 +5,7 @@ import ( "fmt" "io" "net/http" + "strings" cmn "github.com/pzaino/thecrowler/pkg/common" ) @@ -49,3 +50,14 @@ func getQType(expr bool) int { } return 0 } + +// normalizeURL normalizes a URL by trimming trailing slashes and converting it to lowercase. +func normalizeURL(url string) string { + // Trim spaces + url = strings.TrimSpace(url) + // Trim trailing slash + url = strings.TrimRight(url, "/") + // Convert to lowercase + url = strings.ToLower(url) + return url +}
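
---

Note on the meta-tag extraction change above: moving from map[string]string to the new []MetaTag slice lets the crawler keep duplicate meta names (e.g. several "description" tags) and their document order instead of silently overwriting entries. The following is a minimal, self-contained sketch of that behavior, assuming only the MetaTag struct added in pkg/crawler/types.go and the goquery dependency the crawler already uses; the main() wrapper and the sample HTML are illustrative and not part of this patch.

package main

import (
	"fmt"
	"strings"

	"github.com/PuerkitoBio/goquery"
)

// MetaTag mirrors the struct added in pkg/crawler/types.go.
type MetaTag struct {
	Name    string
	Content string
}

// extractMetaTags collects every <meta> element that has a name attribute,
// preserving duplicates and document order (the old map-based version kept
// only the last value seen for each name).
func extractMetaTags(doc *goquery.Document) []MetaTag {
	var metaTags []MetaTag
	doc.Find("meta").Each(func(_ int, s *goquery.Selection) {
		if name, exists := s.Attr("name"); exists {
			content, _ := s.Attr("content")
			metaTags = append(metaTags, MetaTag{Name: name, Content: content})
		}
	})
	return metaTags
}

func main() {
	// Two meta tags share the same name; a map[string]string would keep only
	// one of them, while the slice preserves both entries.
	html := `<html><head>
		<meta name="description" content="first description">
		<meta name="description" content="second description">
		<meta name="keywords" content="crawler, indexing">
	</head><body></body></html>`

	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		panic(err)
	}
	for _, mt := range extractMetaTags(doc) {
		fmt.Printf("%s => %s\n", mt.Name, mt.Content)
	}
}

Running this sketch prints three lines, one per meta tag, which is why insertMetaTags in crawler.go now iterates over a slice rather than ranging over map keys.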