Merge pull request #125 from pzaino/develop
Improved quality of meta tag extraction and fixed a bug in the removeSource command
pzaino authored Mar 1, 2024
2 parents 693bf75 + ec3dac6 commit 3f62ab6
Showing 9 changed files with 126 additions and 133 deletions.
15 changes: 15 additions & 0 deletions cmd/addSource/main.go
@@ -21,6 +21,7 @@ import (
"flag"
"fmt"
"log"
"strings"

cfg "github.com/pzaino/thecrowler/pkg/config"

@@ -35,6 +36,9 @@ func insertWebsite(db *sql.DB, url string) error {
// SQL statement to insert a new website
stmt := `INSERT INTO Sources (url, last_crawled_at, status) VALUES ($1, NULL, 'pending')`

// Normalize the URL
url = normalizeURL(url)

// Execute the SQL statement
_, err := db.Exec(stmt, url)
if err != nil {
@@ -45,6 +49,17 @@ func insertWebsite(db *sql.DB, url string) error {
return nil
}

// normalizeURL normalizes a URL by trimming spaces and trailing slashes and converting it to lowercase.
func normalizeURL(url string) string {
// Trim spaces
url = strings.TrimSpace(url)
// Trim trailing slash
url = strings.TrimRight(url, "/")
// Convert to lowercase
url = strings.ToLower(url)
return url
}

func main() {
configFile := flag.String("config", "config.yaml", "Path to the configuration file")
url := flag.String("url", "", "URL of the website to add")
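As a quick illustration (a hypothetical standalone snippet, not part of the commit), this is what the new helper does to typical input; note that lowercasing applies to the whole URL, path included:

package main

import (
	"fmt"
	"strings"
)

// normalizeURL mirrors the helper added above: trim surrounding spaces,
// drop trailing slashes, and lowercase the entire URL.
func normalizeURL(url string) string {
	url = strings.TrimSpace(url)
	url = strings.TrimRight(url, "/")
	return strings.ToLower(url)
}

func main() {
	fmt.Println(normalizeURL("  https://Example.COM/News/ "))
	// Output: https://example.com/news
}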
148 changes: 45 additions & 103 deletions cmd/removeSource/main.go
@@ -23,141 +23,83 @@ import (
"log"

cfg "github.com/pzaino/thecrowler/pkg/config"

"github.com/lib/pq"
)

var (
config cfg.Config
)

// removeSite removes a site from the database along with its associated entries in other tables.
// It takes a *sql.DB as the database connection and a siteURL string as the URL of the site to be removed.
// It starts a transaction, deletes the site from the Sources table, and then deletes the associated entries
// in the SearchIndex, MetaTags, and KeywordIndex tables. Finally, it commits the transaction.
// If any error occurs during the process, the transaction is rolled back and the error is returned.
func removeSite(db *sql.DB, siteURL string) error {
// Start a transaction
tx, err := db.Begin()
if err != nil {
return err
}

// Delete from Sources
err = deleteFromSources(tx, siteURL)
if err != nil {
return err
}

// Find and delete associated entries in SearchIndex, MetaTags, and KeywordIndex
err = deleteAssociatedEntries(tx, siteURL)
if err != nil {
return err
}

// Commit the transaction
err = tx.Commit()
if err != nil {
return err
}

return nil
// ConsoleResponse represents the structure of the response
// returned by the console API (addSource/removeSource etc.).
type ConsoleResponse struct {
Message string `json:"message"`
}

func deleteFromSources(tx *sql.Tx, siteURL string) error {
_, err := tx.Exec(`DELETE FROM Sources WHERE url = $1`, siteURL)
if err != nil {
rollbackTransaction(tx)
return err
}
return nil
}
func removeSource(tx *sql.Tx, sourceURL string) (ConsoleResponse, error) {
var results ConsoleResponse
results.Message = "Failed to remove the source"

func deleteAssociatedEntries(tx *sql.Tx, siteURL string) error {
indexIDs, err := getAssociatedIndexIDs(tx, siteURL)
// First, get the source_id for the given URL to ensure it exists and to use in cascading deletes if necessary
var sourceID int64
err := tx.QueryRow("SELECT source_id FROM Sources WHERE url = $1", sourceURL).Scan(&sourceID)
if err != nil {
rollbackTransaction(tx)
return err
return results, err
}

err = deleteFromSearchIndex(tx, siteURL)
// Proceed with deleting the source using the obtained source_id
_, err = tx.Exec("DELETE FROM Sources WHERE source_id = $1", sourceID)
if err != nil {
rollbackTransaction(tx)
return err
}

err = deleteFromMetaTags(tx, indexIDs)
if err != nil {
rollbackTransaction(tx)
return err
err2 := tx.Rollback() // Rollback in case of error
if err2 != nil {
return ConsoleResponse{Message: "Failed to delete source"}, err2
}
return ConsoleResponse{Message: "Failed to delete source and related data"}, err
}

err = deleteFromKeywordIndex(tx, indexIDs, siteURL)
_, err = tx.Exec("SELECT cleanup_orphaned_httpinfo();")
if err != nil {
rollbackTransaction(tx)
return err
err2 := tx.Rollback() // Rollback in case of error
if err2 != nil {
return ConsoleResponse{Message: "Failed to cleanup orphaned httpinfo"}, err2
}
return ConsoleResponse{Message: "Failed to cleanup orphaned httpinfo"}, err
}

return nil
}

func getAssociatedIndexIDs(tx *sql.Tx, siteURL string) ([]int, error) {
var indexIDs []int
rows, err := tx.Query(`SELECT index_id FROM SearchIndex WHERE source_id = (SELECT source_id FROM Sources WHERE url = $1)`, siteURL)
_, err = tx.Exec("SELECT cleanup_orphaned_netinfo();")
if err != nil {
return nil, err
}
defer rows.Close()

for rows.Next() {
var indexID int
if err := rows.Scan(&indexID); err != nil {
return nil, err
err2 := tx.Rollback() // Rollback in case of error
if err2 != nil {
return ConsoleResponse{Message: "Failed to cleanup orphaned netinfo"}, err2
}
indexIDs = append(indexIDs, indexID)
return ConsoleResponse{Message: "Failed to cleanup orphaned netinfo"}, err
}

return indexIDs, nil
results.Message = "Source and related data removed successfully"
return results, nil
}

func deleteFromSearchIndex(tx *sql.Tx, siteURL string) error {
_, err := tx.Exec(`DELETE FROM SearchIndex WHERE source_id = (SELECT source_id FROM Sources WHERE url = $1)`, siteURL)
// removeSite removes a site from the database along with its associated entries in other tables.
// It takes a *sql.DB as the database connection and a siteURL string as the URL of the site to be removed.
// It starts a transaction, delegates the deletion to removeSource (which removes the row from Sources
// and runs the orphaned-data cleanup procedures), and finally commits the transaction.
// If any error occurs during the process, the transaction is rolled back and the error is returned.
func removeSite(db *sql.DB, siteURL string) error {
// Start a transaction
tx, err := db.Begin()
if err != nil {
return err
}
return nil
}

func deleteFromMetaTags(tx *sql.Tx, indexIDs []int) error {
for _, id := range indexIDs {
_, err := tx.Exec(`DELETE FROM MetaTags WHERE index_id = $1`, id)
if err != nil {
return err
}
}
return nil
}

func deleteFromKeywordIndex(tx *sql.Tx, indexIDs []int, siteURL string) error {
_, err := tx.Exec(`
DELETE FROM KeywordIndex
WHERE index_id = ANY($1)
AND NOT EXISTS (
SELECT 1 FROM SearchIndex
WHERE index_id = KeywordIndex.index_id
AND source_id != (SELECT source_id FROM Sources WHERE url = $2)
)`, pq.Array(indexIDs), siteURL)
_, err = removeSource(tx, siteURL)
if err != nil {
return err
}
return nil
}

func rollbackTransaction(tx *sql.Tx) {
err := tx.Rollback()
// Commit the transaction
err = tx.Commit()
if err != nil {
log.Printf("Error rolling back transaction: %v\n", err)
return err
}

return nil
}

func main() {
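For orientation, a minimal sketch of driving the reworked flow (hypothetical wiring: the real main parses -config and -url flags, the connection string is illustrative, and a registered postgres driver such as lib/pq is assumed):

func main() {
	// Hypothetical connection setup; the real program reads this from its config.
	db, err := sql.Open("postgres", "user=crowler dbname=thecrowler sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// removeSite opens the transaction; removeSource deletes the Sources row,
	// runs the cleanup_orphaned_* procedures, and rolls back on any failure.
	if err := removeSite(db, "https://example.com"); err != nil {
		log.Fatalf("failed to remove source: %v", err)
	}
	log.Println("source removed")
}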
44 changes: 34 additions & 10 deletions pkg/crawler/crawler.go
@@ -606,11 +606,11 @@ func insertOrUpdateWebObjects(tx *sql.Tx, indexID int64, pageInfo PageInfo) error {

// Step 1: Insert into WebObjects
err := tx.QueryRow(`
INSERT INTO WebObjects (object_url, object_hash, object_content)
INSERT INTO WebObjects (object_html, object_hash, object_content)
VALUES ($1, $2, $3)
ON CONFLICT (object_hash) DO UPDATE
SET object_content = EXCLUDED.object_content
RETURNING object_id;`, pageInfo.URL, hash, pageInfo.BodyText).Scan(&objID)
RETURNING object_id;`, pageInfo.HTML, hash, pageInfo.BodyText).Scan(&objID)
if err != nil {
return err
}
@@ -714,8 +714,11 @@ func insertHTTPInfo(tx *sql.Tx, indexID int64, httpInfo *httpi.HTTPDetails) error {
// It takes a transaction, index ID, and a map of meta tags as parameters.
// Each meta tag is inserted into the MetaTags table with the corresponding index ID, name, and content.
// Returns an error if there was a problem executing the SQL statement.
func insertMetaTags(tx *sql.Tx, indexID int64, metaTags map[string]string) error {
for name, content := range metaTags {
func insertMetaTags(tx *sql.Tx, indexID int64, metaTags []MetaTag) error {
for _, metatag := range metaTags {
name := metatag.Name
content := metatag.Content

var metatagID int64

// Try to find the metatag ID first
@@ -884,6 +887,10 @@ func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo {
title, _ := webPage.Title()
summary := doc.Find("meta[name=description]").AttrOr("content", "")
bodyText := doc.Find("body").Text()
// transform tabs into spaces
bodyText = strings.Replace(bodyText, "\t", " ", -1)
// remove excessive spaces in bodyText
bodyText = strings.Join(strings.Fields(bodyText), " ")

metaTags := extractMetaTags(doc)

@@ -893,6 +900,7 @@ func extractPageInfo(webPage selenium.WebDriver, ctx *processContext) PageInfo {
Title: title,
Summary: summary,
BodyText: bodyText,
HTML: htmlContent,
MetaTags: metaTags,
DetectedLang: detectLang(webPage),
DetectedType: inferDocumentType(currentURL),
@@ -946,14 +954,14 @@ func inferDocumentType(url string) string {

// extractMetaTags is a function that extracts meta tags from a goquery.Document.
// It iterates over each "meta" element in the document and retrieves the "name" and "content" attributes.
// The extracted meta tags are stored in a map[string]string, where the "name" attribute is the key and the "content" attribute is the value.
// The function returns the map of extracted meta tags.
func extractMetaTags(doc *goquery.Document) map[string]string {
metaTags := make(map[string]string)
// The extracted meta tags are stored in a []MetaTag, where the "name" attribute is the key and the "content" attribute is the value.
// The function returns the slice of extracted meta tags.
func extractMetaTags(doc *goquery.Document) []MetaTag {
var metaTags []MetaTag
doc.Find("meta").Each(func(_ int, s *goquery.Selection) {
if name, exists := s.Attr("name"); exists {
content, _ := s.Attr("content")
metaTags[name] = content
metaTags = append(metaTags, MetaTag{Name: name, Content: content})
}
})
return metaTags
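The move from map[string]string to []MetaTag is behavioral, not just cosmetic: a slice preserves document order and keeps repeated names, where a map silently kept only the last value. A hypothetical illustration:

html := `<head><meta name="keywords" content="go"><meta name="keywords" content="crawler"></head>`
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html))
tags := extractMetaTags(doc)
// Old map version: a single entry, "keywords" -> "crawler" (first value lost).
// New slice version: two MetaTag entries, in document order.
fmt.Println(len(tags)) // 2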
@@ -984,14 +992,30 @@ func extractLinks(htmlContent string) []string {
doc.Find("a").Each(func(index int, item *goquery.Selection) {
linkTag := item
link, _ := linkTag.Attr("href")
link = strings.TrimSpace(link)
link = normalizeURL(link, 0)
if link != "" && IsValidURL(link) {
links = append(links, link)
}
})
return links
}

// normalizeURL normalizes a URL by trimming spaces and trailing slashes and, when the corresponding flag is set, converting it to lowercase.
/* flags:
1: Convert to lowercase
*/
func normalizeURL(url string, flags uint) string {
// Trim spaces
url = strings.TrimSpace(url)
// Trim trailing slash
url = strings.TrimRight(url, "/")
// Convert to lowercase
if flags&1 == 1 {
url = strings.ToLower(url)
}
return url
}

// isExternalLink checks if the link is external (aka outside the Source domain)
// isExternalLink checks if linkURL is external to sourceURL based on domainLevel.
func isExternalLink(sourceURL, linkURL string, domainLevel int) bool {
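Note that this crawler-side normalizeURL differs from the one added to cmd/addSource: lowercasing is opt-in via the flags bitmask, so extracted links keep their path case by default (extractLinks above passes 0). A hypothetical call site:

link := normalizeURL("  https://Example.com/Docs/API/  ", 0)
// "https://Example.com/Docs/API" (spaces and trailing slash trimmed, case kept)
host := normalizeURL("HTTPS://EXAMPLE.COM/", 1)
// "https://example.com" (bit 1 set, so the URL is also lowercased)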
4 changes: 2 additions & 2 deletions pkg/crawler/keywords.go
@@ -196,8 +196,8 @@ func extractKeywords(pageInfo PageInfo) []string {
content := normalizeText(doc.Text())

// Extract from meta tags (keywords and description)
keywords = append(keywords, extractFromMetaTag(pageInfo.MetaTags, "keywords")...)
keywords = append(keywords, extractFromMetaTag(pageInfo.MetaTags, "description")...)
keywords = append(keywords, extractFromMetaTag(pageInfo.Keywords, "keywords")...)
keywords = append(keywords, extractFromMetaTag(pageInfo.Keywords, "description")...)

// Extract from main content
contentKeywords := extractContentKeywords(content)
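extractKeywords now reads the keyword and description strings from the new PageInfo.Keywords map rather than from the restructured MetaTags slice. A hypothetical shape of the data it consumes (extractFromMetaTag splits the comma-separated values, as exercised in keywords_test.go below):

pageInfo.Keywords = map[string]string{
	"keywords":    "search, crawler, indexing",
	"description": "A fast and flexible web crawler",
}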
2 changes: 1 addition & 1 deletion pkg/crawler/keywords_test.go
@@ -128,7 +128,7 @@ func TestExtractKeywords(t *testing.T) {
keywords := make(map[string]string)
keywords["keywords"] = testData
pageInfo := PageInfo{
MetaTags: keywords,
Keywords: keywords,
}

tests := []struct {
10 changes: 9 additions & 1 deletion pkg/crawler/types.go
@@ -30,14 +30,22 @@ type SeleniumInstance struct {
Config cfg.Selenium
}

// MetaTag represents a single meta tag, including its name and content.
type MetaTag struct {
Name string
Content string
}

// PageInfo represents the information of a web page.
type PageInfo struct {
URL string // The URL of the web page.
sourceID int64 // The ID of the source.
Title string // The title of the web page.
Summary string // A summary of the web page content.
BodyText string // The main body text of the web page.
MetaTags map[string]string // The meta tags of the web page.
HTML string // The HTML content of the web page.
MetaTags []MetaTag // The meta tags of the web page.
Keywords map[string]string // The keywords of the web page.
DetectedType string // The detected document type of the web page.
DetectedLang string // The detected language of the web page.
NetInfo *neti.NetInfo // The network information of the web page.
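Putting the type changes together, a hypothetical PageInfo literal under the new layout (field values are illustrative):

page := PageInfo{
	URL:      "https://example.com",
	Title:    "Example",
	Summary:  "An example page",
	BodyText: "normalized body text",
	HTML:     "<html>...</html>",
	MetaTags: []MetaTag{{Name: "description", Content: "An example page"}},
	Keywords: map[string]string{"keywords": "example, demo"},
}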
15 changes: 2 additions & 13 deletions pkg/database/postgresql-setup-v1.2.pgsql
@@ -96,11 +96,11 @@ CREATE TABLE IF NOT EXISTS WebObjects (
object_id BIGSERIAL PRIMARY KEY,
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP NOT NULL,
last_updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
object_url TEXT NOT NULL, -- The original URL where the object was found
object_link TEXT NOT NULL DEFAULT 'db', -- The link to where the object is stored if not in the DB
object_type VARCHAR(255) NOT NULL DEFAULT 'text/html', -- The type of the object, for fast searches
object_hash VARCHAR(64) UNIQUE NOT NULL, -- SHA256 hash of the object for fast comparison and uniqueness
object_content TEXT -- The actual content of the object, nullable if stored externally
object_content TEXT, -- The actual content of the object, nullable if stored externally
object_html TEXT -- The HTML content of the object, nullable if stored externally
);

-- MetaTags table stores the meta tags from the SearchIndex
@@ -502,17 +502,6 @@ BEGIN
END
$$;

-- Creates an index for the WebObjects object_url column
DO $$
BEGIN
-- Check if the index already exists
IF NOT EXISTS (SELECT 1 FROM pg_indexes WHERE indexname = 'idx_webobjects_object_url') THEN
-- Create the index if it doesn't exist
CREATE INDEX idx_webobjects_object_url ON WebObjects(object_url text_pattern_ops);
END IF;
END
$$;

-- Creates an index for the WebObjects object_link column
DO $$
BEGIN
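With object_url and its text_pattern_ops index gone, WebObjects lookups pivot on the unique object_hash column; a hypothetical Go-side query against the revised table (hash is an assumed SHA256 hex digest computed by the caller):

var objID int64
var htmlLen int
err := db.QueryRow(
	`SELECT object_id, COALESCE(LENGTH(object_html), 0)
	 FROM WebObjects WHERE object_hash = $1`,
	hash,
).Scan(&objID, &htmlLen)
if err == sql.ErrNoRows {
	// no object stored under this hash
}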