Skip to content

Commit

Permalink
feat: add sitemap
Browse files Browse the repository at this point in the history
  • Loading branch information
marcotuna committed Sep 1, 2023
1 parent 8a5d347 commit 66867ba
Show file tree
Hide file tree
Showing 2 changed files with 346 additions and 0 deletions.
105 changes: 105 additions & 0 deletions cmd/wp-go-static/commands/sitemap.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
package commands

import (
"fmt"
"log"
"net/url"
"strings"

goSitemap "wp-go-static/pkg/sitemap"

"github.com/spf13/cobra"
"github.com/spf13/viper"
)

type SitemapConfig struct {
Dir string `mapstructure:"dir"`
URL string `mapstructure:"url"`
ReplaceURL string `mapstructure:"replace-url"`
File string `mapstructure:"file"`
}

// SitemapCmd is the "sitemap" subcommand. Its flags are defined and the
// command is attached to RootCmd in this file's init function; running it
// invokes sitemapCmdF.
var SitemapCmd = &cobra.Command{
Use: "sitemap",
Short: "Create sitemap from the Wordpress website",
RunE: sitemapCmdF,
}

// init declares the flags of the sitemap command, binds them to viper and
// registers the command on RootCmd. Any setup failure aborts the program.
func init() {
	// Define command-line flags
	flags := SitemapCmd.PersistentFlags()
	flags.String("dir", "dump", "directory to save downloaded files")
	flags.String("url", "", "URL to scrape")
	flags.String("replace-url", "", "Replace with a specific url")
	flags.String("file", "sitemap.xml", "Output sitemap file name")

	// "url" is declared on the persistent flag set, so it has to be marked
	// required there; MarkFlagRequired only looks at the local flag set and
	// would fail with "no such flag".
	if err := SitemapCmd.MarkPersistentFlagRequired("url"); err != nil {
		log.Fatal(err)
	}

	// Bind command-line flags to Viper
	if err := viper.BindPFlags(flags); err != nil {
		log.Fatal(err)
	}

	RootCmd.AddCommand(SitemapCmd)
}

// sitemapCmdF runs the sitemap command. It downloads the sitemap found at
// the configured URL, optionally rewrites the scheme and host of every entry
// to the configured replacement URL, prints the resulting XML to standard
// output and, when a file name is configured, saves it in the configured
// directory.
//
// It returns an error when the configuration cannot be decoded, the sitemap
// cannot be fetched, the configured URL cannot be parsed or has no host, or
// the sitemap cannot be rendered or saved.
func sitemapCmdF(command *cobra.Command, args []string) error {
	sitemapConfig := SitemapConfig{}
	if err := viper.Unmarshal(&sitemapConfig); err != nil {
		return fmt.Errorf("reading sitemap configuration: %w", err)
	}

	smap, err := goSitemap.Get(sitemapConfig.URL, nil)
	if err != nil {
		// Without a sitemap there is nothing meaningful to print or save.
		return fmt.Errorf("fetching sitemap from %q: %w", sitemapConfig.URL, err)
	}

	// Replace the URL with the url from the replace-url argument.
	// Only the scheme and host are replaced; path and query are kept.
	if sitemapConfig.ReplaceURL != "" {
		currentURL, err := url.Parse(sitemapConfig.URL)
		if err != nil {
			return fmt.Errorf("parsing url %q: %w", sitemapConfig.URL, err)
		}
		if currentURL.Host == "" {
			// An empty host would turn the patterns below into bare
			// "http://" / "https://" prefixes and corrupt every entry.
			return fmt.Errorf("url %q has no host to replace", sitemapConfig.URL)
		}

		// A single Replacer scans each location once, so text that was just
		// inserted is never matched again. This matters when the replacement
		// itself starts with one of the patterns (e.g. same host, new path).
		replacer := strings.NewReplacer(
			`http://`+currentURL.Host, sitemapConfig.ReplaceURL,
			`http:\/\/`+currentURL.Host, sitemapConfig.ReplaceURL,
			`https://`+currentURL.Host, sitemapConfig.ReplaceURL,
			`https:\/\/`+currentURL.Host, sitemapConfig.ReplaceURL,
		)

		for i := range smap.URL {
			smap.URL[i].Loc = replacer.Replace(smap.URL[i].Loc)
		}

		// TODO: apply the same replacement to image locations once the
		// sitemap package exposes image entries on its URL type.
	}

	// Print the Sitemap
	printSmap, err := smap.Print()
	if err != nil {
		return err
	}

	fmt.Printf("%s\n", printSmap)

	// Write the Sitemap to a file
	if sitemapConfig.File != "" {
		fmt.Printf("Writing sitemap to %s/%s\n", sitemapConfig.Dir, sitemapConfig.File)
		return smap.Save(sitemapConfig.Dir, sitemapConfig.File)
	}

	return nil
}
241 changes: 241 additions & 0 deletions pkg/sitemap/sitemap.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
package sitemap

import (
"encoding/xml"
"fmt"
"io"
"net/http"
"os"
"time"
)

// Index models a <sitemapindex> document, i.e. a list of references to
// individual sitemap files.
type Index struct {
	// XMLName pins the root element name to <sitemapindex>.
	XMLName xml.Name `xml:"sitemapindex"`

	// Sitemap holds one entry per <sitemap> child element.
	Sitemap []parts `xml:"sitemap"`
}

// parts is a structure of <sitemap> in <sitemapindex>
type parts struct {
Loc string `xml:"loc"`
LastMod string `xml:"lastmod"`
}

// Sitemap models a <urlset> document, i.e. a sitemap file listing page URLs.
type Sitemap struct {
	// Attributes of the root element that are currently disabled:
	// Xsi string `xml:"xsi,attr"`
	// Image string `xml:"image,attr"`
	// SchemaLocation string `xml:"schemaLocation,attr"`
	// Xmlns string `xml:"xmlns,attr"`

	// XMLName pins the root element name to <urlset>.
	XMLName xml.Name `xml:"urlset"`

	// URL holds one entry per <url> child element.
	URL []URL `xml:"url"`
}

// URL is a structure of <url> in <sitemap>
type URL struct {
Loc string `xml:"loc"`
LastMod string `xml:"lastmod,omitempty"`
ChangeFreq string `xml:"changefreq,omitempty"`
Priority float32 `xml:"priority,omitempty"`
// Image []Image `xml:"image,omitempty"`
}

// Image is a structure of <image> in <url>
type Image struct {
Loc string `xml:"loc,omitempty"`
Title string `xml:"title,omitempty"`
Caption string `xml:"caption,omitempty"`
GeoLoc string `xml:"geolocation,omitempty"`
License string `xml:"license,omitempty"`
}

var (
// fetch is page acquisition function
fetch = func(URL string, options interface{}) ([]byte, error) {
var body []byte

res, err := http.Get(URL)
if err != nil {
return body, err
}
defer res.Body.Close()

return io.ReadAll(res.Body)
}

// Time interval to be used in Index.get
interval = time.Second
)

// Get fetches URL and parses the response as either a sitemap index
// (<sitemapindex>) or a sitemap (<urlset>). For a sitemap index, every
// sitemap it lists is fetched as well and merged into the returned Sitemap.
//
// An error is returned when:
//   - the document at URL could not be retrieved;
//   - the document at URL is empty;
//   - the document at URL is neither a valid sitemap index nor a valid
//     sitemap (both parse errors are reported);
//   - a sitemap listed in the sitemap index could not be retrieved or parsed.
//
// If you want to ignore the errors of the last kind, use the ForceGet
// function.
func Get(URL string, options interface{}) (Sitemap, error) {
	data, err := fetch(URL, options)
	if err != nil {
		return Sitemap{}, err
	}

	idx, idxErr := ParseIndex(data)
	if idxErr != nil {
		// Not a sitemap index: fall back to treating it as a plain sitemap.
		smap, smapErr := Parse(data)
		if smapErr != nil {
			return Sitemap{}, fmt.Errorf("URL is not a sitemap or sitemapindex: %v; %v", idxErr, smapErr)
		}
		return smap, nil
	}

	smap, err := idx.get(options, false)
	if err != nil {
		return Sitemap{}, err
	}

	return smap, nil
}

// ForceGet fetches URL and parses the response as either a sitemap index
// (<sitemapindex>) or a sitemap (<urlset>), like Get, but tolerates problems
// with the individual sitemaps listed in a sitemap index.
//
// Errors that are ignored:
//   - a sitemap listed in the sitemap index could not be retrieved;
//   - a sitemap listed in the sitemap index is empty or malformed.
//
// Errors that are still returned:
//   - the document at URL could not be retrieved;
//   - the document at URL is empty;
//   - the document at URL is neither a valid sitemap index nor a valid
//     sitemap (both parse errors are reported).
//
// If you do not want any error to be ignored, use the Get function.
func ForceGet(URL string, options interface{}) (Sitemap, error) {
	data, err := fetch(URL, options)
	if err != nil {
		return Sitemap{}, err
	}

	idx, idxErr := ParseIndex(data)
	if idxErr != nil {
		// Not a sitemap index: fall back to treating it as a plain sitemap.
		smap, smapErr := Parse(data)
		if smapErr != nil {
			return Sitemap{}, fmt.Errorf("URL is not a sitemap or sitemapindex: %v; %v", idxErr, smapErr)
		}
		return smap, nil
	}

	smap, err := idx.get(options, true)
	if err != nil {
		return Sitemap{}, err
	}

	return smap, nil
}

// get downloads every sitemap listed in the index and merges their entries
// into a single Sitemap.
//
// When ignoreErr is false, the first sitemap that cannot be retrieved or
// parsed aborts the operation and its error is returned. When ignoreErr is
// true, such sitemaps are skipped and the returned error is always nil.
func (idx *Index) get(options interface{}, ignoreErr bool) (Sitemap, error) {
	var smap Sitemap

	for _, s := range idx.Sitemap {
		// Pause before each request to space out the downloads.
		time.Sleep(interval)

		data, err := fetch(s.Loc, options)
		if err != nil {
			if ignoreErr {
				// A failed download leaves nothing reliable to parse.
				continue
			}
			return Sitemap{}, fmt.Errorf("failed to retrieve %s in sitemapindex.xml: %w", s.Loc, err)
		}

		// Unmarshal extends smap.URL, so entries accumulate across sitemaps.
		if err := xml.Unmarshal(data, &smap); err != nil && !ignoreErr {
			return Sitemap{}, fmt.Errorf("failed to parse %s in sitemapindex.xml: %w", s.Loc, err)
		}
	}

	return smap, nil
}

// Parse decodes the XML text of a sitemap (<urlset>) into a Sitemap.
// Empty input is rejected with an error; otherwise the result of the XML
// decoder is returned as is, together with whatever was decoded.
func Parse(data []byte) (Sitemap, error) {
	var result Sitemap

	if len(data) > 0 {
		decodeErr := xml.Unmarshal(data, &result)
		return result, decodeErr
	}

	return result, fmt.Errorf("sitemap.xml is empty")
}

// ParseIndex decodes the XML text of a sitemap index (<sitemapindex>) into
// an Index. Empty input is rejected with an error; otherwise the result of
// the XML decoder is returned as is, together with whatever was decoded.
func ParseIndex(data []byte) (Index, error) {
	var result Index

	if len(data) > 0 {
		decodeErr := xml.Unmarshal(data, &result)
		return result, decodeErr
	}

	return result, fmt.Errorf("sitemapindex.xml is empty")
}

// SetInterval changes the pause observed before each request made by
// Index.get while downloading the sitemaps of a sitemap index.
func SetInterval(d time.Duration) {
	// The parameter is deliberately not called "time": that name would shadow
	// the time package inside this function.
	interval = d
}

// SetFetch replaces the function this package uses to download documents.
// f receives the URL to retrieve together with the options value passed to
// Get or ForceGet, and returns the raw document body or an error.
func SetFetch(f func(URL string, options interface{}) ([]byte, error)) {
fetch = f
}

// Print renders the sitemap as indented XML and returns the resulting bytes.
// Despite its name it writes nothing itself, and the output carries no XML
// declaration.
func (smap *Sitemap) Print() ([]byte, error) {
	rendered, err := xml.MarshalIndent(smap, "", " ")
	return rendered, err
}

// Save renders the sitemap as XML, prefixes it with the XML declaration and
// writes it to file inside dir, replacing any existing file.
//
// dir is created, including missing parent directories, when it does not
// exist yet. An empty dir means the current directory.
//
// It returns an error when the sitemap cannot be rendered, the directory
// cannot be created or the file cannot be written.
func (smap *Sitemap) Save(dir, file string) error {
	data, err := smap.Print()
	if err != nil {
		return err
	}

	// Add the xml header
	data = append([]byte(xml.Header), data...)

	target := file
	if dir != "" {
		// MkdirAll succeeds when dir already exists, so no prior Stat is needed.
		if err := os.MkdirAll(dir, 0755); err != nil {
			return fmt.Errorf("creating directory %q: %w", dir, err)
		}
		target = fmt.Sprintf("%s/%s", dir, file)
	}

	// WriteFile creates or truncates the file itself and closes it afterwards;
	// a separate os.Create would only leave an unclosed file handle behind.
	return os.WriteFile(target, data, 0644)
}

0 comments on commit 66867ba

Please sign in to comment.