-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
2 changed files
with
346 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
package commands | ||
|
||
import ( | ||
"fmt" | ||
"log" | ||
"net/url" | ||
"strings" | ||
|
||
goSitemap "wp-go-static/pkg/sitemap" | ||
|
||
"github.com/spf13/cobra" | ||
"github.com/spf13/viper" | ||
) | ||
|
||
type SitemapConfig struct { | ||
Dir string `mapstructure:"dir"` | ||
URL string `mapstructure:"url"` | ||
ReplaceURL string `mapstructure:"replace-url"` | ||
File string `mapstructure:"file"` | ||
} | ||
|
||
// SitemapCmd ... | ||
var SitemapCmd = &cobra.Command{ | ||
Use: "sitemap", | ||
Short: "Create sitemap from the Wordpress website", | ||
RunE: sitemapCmdF, | ||
} | ||
|
||
func init() { | ||
// Define command-line flags | ||
SitemapCmd.PersistentFlags().String("dir", "dump", "directory to save downloaded files") | ||
SitemapCmd.PersistentFlags().String("url", "", "URL to scrape") | ||
SitemapCmd.PersistentFlags().String("replace-url", "", "Replace with a specific url") | ||
SitemapCmd.PersistentFlags().String("file", "sitemap.xml", "Output sitemap file name") | ||
SitemapCmd.MarkFlagRequired("url") | ||
|
||
// Bind command-line flags to Viper | ||
err := viper.BindPFlags(SitemapCmd.PersistentFlags()) | ||
if err != nil { | ||
log.Fatal(err) | ||
} | ||
|
||
RootCmd.AddCommand(SitemapCmd) | ||
} | ||
|
||
func sitemapCmdF(command *cobra.Command, args []string) error { | ||
sitemapConfig := SitemapConfig{} | ||
viper.Unmarshal(&sitemapConfig) | ||
|
||
smap, err := goSitemap.Get(sitemapConfig.URL, nil) | ||
if err != nil { | ||
fmt.Println(err) | ||
} | ||
|
||
for i := range smap.URL { | ||
// Replace the URL with the url from the replace-url argument | ||
// Only with the URL part, persist the URL path and query | ||
if sitemapConfig.ReplaceURL != "" { | ||
currentURL, _ := url.Parse(sitemapConfig.URL) | ||
|
||
optionList := []string{ | ||
fmt.Sprintf(`http://%s`, currentURL.Host), | ||
fmt.Sprintf(`http:\/\/%s`, currentURL.Host), | ||
fmt.Sprintf(`https://%s`, currentURL.Host), | ||
fmt.Sprintf(`https:\/\/%s`, currentURL.Host), | ||
} | ||
|
||
for _, option := range optionList { | ||
if i >= len(smap.URL) { | ||
fmt.Println("Index out of range for smap.URL") | ||
break | ||
} | ||
smap.URL[i].Loc = strings.ReplaceAll(string(smap.URL[i].Loc), option, sitemapConfig.ReplaceURL) | ||
|
||
// for j := range smap.Image { | ||
// if i >= len(smap.URL) { | ||
// fmt.Println("Index out of range for smap.URL") | ||
// break | ||
// } | ||
// if j >= len(smap.URL[i].Image) { | ||
// fmt.Println("Index out of range for smap.URL[i].Image") | ||
// break | ||
// } | ||
// smap.URL[i].Image[j].Loc = strings.ReplaceAll(string(smap.URL[i].Image[j].Loc), option, sitemapConfig.ReplaceURL) | ||
// } | ||
} | ||
} | ||
} | ||
|
||
// Print the Sitemap | ||
printSmap, err := smap.Print() | ||
if err != nil { | ||
return err | ||
} | ||
|
||
fmt.Printf("%s\n", printSmap) | ||
|
||
// Write the Sitemap to a file | ||
if sitemapConfig.File != "" { | ||
fmt.Printf("Writing sitemap to %s/%s\n", sitemapConfig.Dir, sitemapConfig.File) | ||
return smap.Save(sitemapConfig.Dir, sitemapConfig.File) | ||
} | ||
|
||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,241 @@ | ||
package sitemap | ||
|
||
import ( | ||
"encoding/xml" | ||
"fmt" | ||
"io" | ||
"net/http" | ||
"os" | ||
"time" | ||
) | ||
|
||
// Index is a structure of <sitemapindex> | ||
type Index struct { | ||
XMLName xml.Name `xml:"sitemapindex"` | ||
Sitemap []parts `xml:"sitemap"` | ||
} | ||
|
||
// parts is a structure of <sitemap> in <sitemapindex> | ||
type parts struct { | ||
Loc string `xml:"loc"` | ||
LastMod string `xml:"lastmod"` | ||
} | ||
|
||
// Sitemap is a structure of <sitemap> | ||
type Sitemap struct { | ||
// Xsi string `xml:"xsi,attr"` | ||
// Image string `xml:"image,attr"` | ||
// SchemaLocation string `xml:"schemaLocation,attr"` | ||
// Xmlns string `xml:"xmlns,attr"` | ||
XMLName xml.Name `xml:"urlset"` | ||
URL []URL `xml:"url"` | ||
} | ||
|
||
// URL is a structure of <url> in <sitemap> | ||
type URL struct { | ||
Loc string `xml:"loc"` | ||
LastMod string `xml:"lastmod,omitempty"` | ||
ChangeFreq string `xml:"changefreq,omitempty"` | ||
Priority float32 `xml:"priority,omitempty"` | ||
// Image []Image `xml:"image,omitempty"` | ||
} | ||
|
||
// Image is a structure of <image> in <url> | ||
type Image struct { | ||
Loc string `xml:"loc,omitempty"` | ||
Title string `xml:"title,omitempty"` | ||
Caption string `xml:"caption,omitempty"` | ||
GeoLoc string `xml:"geolocation,omitempty"` | ||
License string `xml:"license,omitempty"` | ||
} | ||
|
||
var ( | ||
// fetch is page acquisition function | ||
fetch = func(URL string, options interface{}) ([]byte, error) { | ||
var body []byte | ||
|
||
res, err := http.Get(URL) | ||
if err != nil { | ||
return body, err | ||
} | ||
defer res.Body.Close() | ||
|
||
return io.ReadAll(res.Body) | ||
} | ||
|
||
// Time interval to be used in Index.get | ||
interval = time.Second | ||
) | ||
|
||
/* | ||
Get is fetch and parse sitemap.xml/sitemapindex.xml | ||
If sitemap.xml or sitemapindex.xml has some problems, This function return error. | ||
・When sitemap.xml/sitemapindex.xml could not retrieved. | ||
・When sitemap.xml/sitemapindex.xml is empty. | ||
・When sitemap.xml/sitemapindex.xml has format problems. | ||
・When sitemapindex.xml contains a sitemap.xml URL that cannot be retrieved. | ||
・When sitemapindex.xml contains a sitemap.xml that is empty | ||
・When sitemapindex.xml contains a sitemap.xml that has format problems. | ||
If you want to ignore these errors, use the ForceGet function. | ||
*/ | ||
func Get(URL string, options interface{}) (Sitemap, error) { | ||
data, err := fetch(URL, options) | ||
if err != nil { | ||
return Sitemap{}, err | ||
} | ||
|
||
idx, idxErr := ParseIndex(data) | ||
smap, smapErr := Parse(data) | ||
|
||
if idxErr != nil && smapErr != nil { | ||
if idxErr != nil { | ||
err = idxErr | ||
} else { | ||
err = smapErr | ||
} | ||
return Sitemap{}, fmt.Errorf("URL is not a sitemap or sitemapindex: %v", err) | ||
} else if idxErr != nil { | ||
return smap, nil | ||
} | ||
|
||
smap, err = idx.get(options, false) | ||
if err != nil { | ||
return Sitemap{}, err | ||
} | ||
|
||
return smap, nil | ||
} | ||
|
||
/* | ||
ForceGet is fetch and parse sitemap.xml/sitemapindex.xml. | ||
The difference with the Get function is that it ignores some errors. | ||
Errors to Ignore: | ||
・When sitemapindex.xml contains a sitemap.xml URL that cannot be retrieved. | ||
・When sitemapindex.xml contains a sitemap.xml that is empty | ||
・When sitemapindex.xml contains a sitemap.xml that has format problems. | ||
Errors not to Ignore: | ||
・When sitemap.xml/sitemapindex.xml could not retrieved. | ||
・When sitemap.xml/sitemapindex.xml is empty. | ||
・When sitemap.xml/sitemapindex.xml has format problems. | ||
If you want **not** to ignore some errors, use the Get function. | ||
*/ | ||
func ForceGet(URL string, options interface{}) (Sitemap, error) { | ||
data, err := fetch(URL, options) | ||
if err != nil { | ||
return Sitemap{}, err | ||
} | ||
|
||
idx, idxErr := ParseIndex(data) | ||
smap, smapErr := Parse(data) | ||
|
||
if idxErr != nil && smapErr != nil { | ||
if idxErr != nil { | ||
err = idxErr | ||
} else { | ||
err = smapErr | ||
} | ||
return Sitemap{}, fmt.Errorf("URL is not a sitemap or sitemapindex: %v", err) | ||
} else if idxErr != nil { | ||
return smap, nil | ||
} | ||
|
||
smap, err = idx.get(options, true) | ||
if err != nil { | ||
return Sitemap{}, err | ||
} | ||
|
||
return smap, nil | ||
} | ||
|
||
// Get Sitemap data from sitemapindex file | ||
func (idx *Index) get(options interface{}, ignoreErr bool) (Sitemap, error) { | ||
var smap Sitemap | ||
|
||
for _, s := range idx.Sitemap { | ||
time.Sleep(interval) | ||
data, err := fetch(s.Loc, options) | ||
if !ignoreErr && err != nil { | ||
return smap, fmt.Errorf("failed to retrieve %s in sitemapindex.xml: %v", s.Loc, err) | ||
} | ||
|
||
err = xml.Unmarshal(data, &smap) | ||
if !ignoreErr && err != nil { | ||
return smap, fmt.Errorf("failed to parse %s in sitemapindex.xml: %v", s.Loc, err) | ||
} | ||
} | ||
|
||
return smap, nil | ||
} | ||
|
||
// Parse create Sitemap data from text | ||
func Parse(data []byte) (Sitemap, error) { | ||
var smap Sitemap | ||
if len(data) == 0 { | ||
return smap, fmt.Errorf("sitemap.xml is empty") | ||
} | ||
|
||
err := xml.Unmarshal(data, &smap) | ||
return smap, err | ||
} | ||
|
||
// ParseIndex create Index data from text | ||
func ParseIndex(data []byte) (Index, error) { | ||
var idx Index | ||
if len(data) == 0 { | ||
return idx, fmt.Errorf("sitemapindex.xml is empty") | ||
} | ||
|
||
err := xml.Unmarshal(data, &idx) | ||
return idx, err | ||
} | ||
|
||
// SetInterval change Time interval to be used in Index.get | ||
func SetInterval(time time.Duration) { | ||
interval = time | ||
} | ||
|
||
// SetFetch change fetch closure | ||
func SetFetch(f func(URL string, options interface{}) ([]byte, error)) { | ||
fetch = f | ||
} | ||
|
||
// Print shows the sitemap from Sitemap struct | ||
func (smap *Sitemap) Print() ([]byte, error) { | ||
return xml.MarshalIndent(smap, "", " ") | ||
} | ||
|
||
// Save creates the sitemap from Sitemap struct and save it to file | ||
func (smap *Sitemap) Save(dir, file string) error { | ||
data, err := smap.Print() | ||
if err != nil { | ||
return err | ||
} | ||
|
||
// Add the xml header | ||
data = append([]byte(xml.Header), data...) | ||
|
||
// Create directory if it does not exist | ||
if _, err := os.Stat(dir); os.IsNotExist(err) { | ||
os.Mkdir(dir, 0755) | ||
} | ||
|
||
_, err = os.Create(fmt.Sprintf("%s/%s", dir, file)) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
err = os.WriteFile(fmt.Sprintf("%s/%s", dir, file), data, 0644) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
return nil | ||
} |