Skip to content

Commit

Permalink
feat: add robots scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
marcotuna committed Nov 12, 2023
1 parent fd15e03 commit 8bc41f7
Show file tree
Hide file tree
Showing 3 changed files with 115 additions and 1 deletion.
107 changes: 107 additions & 0 deletions cmd/wp-go-static/commands/robots.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package commands

import (
"fmt"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"strings"

"wp-go-static/internal/config"

"github.com/spf13/cobra"
"github.com/spf13/pflag"
"github.com/spf13/viper"
)

// RobotsCmd is the "robots" subcommand. Its RunE handler, robotsCmdF,
// downloads a robots.txt and writes a (optionally URL-rewritten) copy to disk.
var RobotsCmd = &cobra.Command{
	Use:   "robots",
	Short: "Create robots from the Wordpress website",
	RunE:  robotsCmdF,
}

// bindFlagRobotsPrefix is the Viper key namespace under which the robots
// command's flags are bound (for example "robots.url").
const bindFlagRobotsPrefix = "robots"

// init declares the robots command's persistent flags, binds each of them to
// a Viper key of the form "<bindFlagRobotsPrefix>.<flag name>", and attaches
// the command to RootCmd.
func init() {
	flags := RobotsCmd.PersistentFlags()

	// Command-line flags and their defaults.
	flags.String("dir", "dump", "directory to save downloaded files")
	flags.String("url", "", "URL to scrape")
	flags.String("replace-url", "", "Replace with a specific url")
	flags.String("file", "robots.txt", "Output robots file name")

	// Mirror every declared flag into Viper so it can be unmarshalled into
	// the configuration struct later on.
	flags.VisitAll(func(f *pflag.Flag) {
		key := fmt.Sprintf("%s.%s", bindFlagRobotsPrefix, f.Name)
		viper.BindPFlag(key, flags.Lookup(f.Name))
	})

	RootCmd.AddCommand(RobotsCmd)
}

// robotsCmdF is the RunE handler of the robots command. It downloads the
// document at the configured robots URL, optionally rewrites references to
// the source site so they point at the configured replacement URL, prints the
// result to standard output, and saves it as <dir>/<file>.
//
// The command and args parameters are unused; they are present to satisfy the
// cobra RunE signature.
//
// It returns an error when the configuration cannot be decoded, the download
// fails or does not answer with 200 OK, the source URL cannot be parsed, or
// the output file cannot be written.
func robotsCmdF(command *cobra.Command, args []string) error {
	// Named cfg (not config) so the imported config package is not shadowed.
	var cfg config.Config
	if err := viper.Unmarshal(&cfg); err != nil {
		return fmt.Errorf("decoding configuration: %w", err)
	}
	robots := cfg.Robots

	// Fetch the robots.txt from the provided URL.
	resp, err := http.Get(robots.URL)
	if err != nil {
		return fmt.Errorf("fetching %q: %w", robots.URL, err)
	}
	defer resp.Body.Close()

	// Without this check an error page (404, 500, ...) would be written out
	// as if it were the site's robots.txt.
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("fetching %q: unexpected status %s", robots.URL, resp.Status)
	}

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return fmt.Errorf("reading response from %q: %w", robots.URL, err)
	}

	modifiedBody := string(body)
	if robots.ReplaceURL != "" {
		modifiedBody, err = rewriteRobotsURLs(modifiedBody, robots.URL, robots.ReplaceURL)
		if err != nil {
			return err
		}
	}

	// Print the output.
	fmt.Println(modifiedBody)

	// Make sure the target directory exists so a first run does not fail.
	if robots.Dir != "" {
		if err := os.MkdirAll(robots.Dir, 0o755); err != nil {
			return fmt.Errorf("creating output directory %q: %w", robots.Dir, err)
		}
	}

	// os.WriteFile creates/truncates the file like os.Create did (mode 0666
	// before umask) and also reports errors that occur while closing it.
	outPath := filepath.Join(robots.Dir, robots.File)
	if err := os.WriteFile(outPath, []byte(modifiedBody), 0o666); err != nil {
		return fmt.Errorf("writing %q: %w", outPath, err)
	}

	return nil
}

// rewriteRobotsURLs returns body with every occurrence of sourceURL, and of
// the source host prefixed with "http://" or "https://" (plain or with
// backslash-escaped slashes), replaced by replaceURL.
//
// All patterns are applied in a single pass, so text inserted by one
// replacement is never rewritten again by another. This matters when
// replaceURL itself begins with "https://<source host>". sourceURL is listed
// first so that it wins over the shorter scheme+host patterns that match at
// the same position.
//
// It returns an error if sourceURL cannot be parsed or has no host.
func rewriteRobotsURLs(body, sourceURL, replaceURL string) (string, error) {
	parsed, err := url.Parse(sourceURL)
	if err != nil {
		return "", fmt.Errorf("parsing source URL %q: %w", sourceURL, err)
	}

	host := parsed.Host
	if host == "" {
		// An empty host would turn the patterns into bare "http://" and
		// "https://", rewriting every URL in the document.
		return "", fmt.Errorf("source URL %q has no host", sourceURL)
	}

	replacer := strings.NewReplacer(
		sourceURL, replaceURL,
		"http://"+host, replaceURL,
		`http:\/\/`+host, replaceURL,
		"https://"+host, replaceURL,
		`https:\/\/`+host, replaceURL,
	)

	return replacer.Replace(body), nil
}
1 change: 0 additions & 1 deletion cmd/wp-go-static/commands/scrape.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,6 @@ func scrapeCmdF(command *cobra.Command, args []string) error {
})

urlsToVisit := []string{
"robots.txt",
"favicon.ico",
}

Expand Down
8 changes: 8 additions & 0 deletions internal/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package config
// Config is the root configuration object. Each field holds the settings of
// one subcommand and is decoded from the key named in its mapstructure tag.
type Config struct {
	// Scrape holds the settings decoded from the "scrape" key.
	Scrape ScrapeConfig `mapstructure:"scrape"`
	// Sitemap holds the settings decoded from the "sitemap" key.
	Sitemap SitemapConfig `mapstructure:"sitemap"`
	// Robots holds the settings decoded from the "robots" key.
	Robots RobotsConfig `mapstructure:"robots"`
}

type SitemapConfig struct {
Expand All @@ -22,3 +23,10 @@ type ScrapeConfig struct {
Images bool `mapstructure:"images"`
CheckHead bool `mapstructure:"check-head"`
}

// RobotsConfig holds the settings of the robots command. Each field is
// decoded from the key named in its mapstructure tag.
type RobotsConfig struct {
	// Dir is the directory the output file is written to.
	Dir string `mapstructure:"dir"`
	// URL is the address the robots file is downloaded from.
	URL string `mapstructure:"url"`
	// ReplaceURL, when non-empty, replaces references to the source site in
	// the downloaded content.
	ReplaceURL string `mapstructure:"replace-url"`
	// File is the name of the output file inside Dir.
	File string `mapstructure:"file"`
}

0 comments on commit 8bc41f7

Please sign in to comment.