From bf039c6adf5d0d06dbc14fce5e8b1c1c8f1b46bf Mon Sep 17 00:00:00 2001 From: nobody Date: Sat, 17 Apr 2021 13:16:03 +0000 Subject: [PATCH] Refactor package - Upgrade to Go 1.16 - Build binary for Apple Silicon - Change package path from `github.com/wabarc/archive.org/pkg` to `github.com/wabarc/archive.org` - Minor improvements - Do not redirect for wayback - Add playback func to search archived URLs - Add more tests --- .github/workflows/release.yml | 23 ++-- Makefile | 12 +- cmd/archive.org/ia.go | 54 +++++++++ cmd/ia.go | 28 ----- pkg/doc.go => doc.go | 0 go.mod | 5 +- go.sum | 2 + ia.go | 212 ++++++++++++++++++++++++++++++++++ ia_test.go | 87 ++++++++++++++ main.go | 7 -- main_test.go | 21 ---- pkg/http.go | 120 ------------------- pkg/ia.go | 44 ------- pkg/ia_test.go | 28 ----- version.go | 13 +-- 15 files changed, 375 insertions(+), 281 deletions(-) create mode 100644 cmd/archive.org/ia.go delete mode 100644 cmd/ia.go rename pkg/doc.go => doc.go (100%) create mode 100644 ia.go create mode 100644 ia_test.go delete mode 100644 main.go delete mode 100644 main_test.go delete mode 100644 pkg/http.go delete mode 100644 pkg/ia.go delete mode 100644 pkg/ia_test.go diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ee6b63f..c77921e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -44,6 +44,8 @@ jobs: arch: ppc64 - os: linux arch: ppc64le + - os: darwin + arch: arm64 exclude: - os: darwin arch: 386 @@ -68,7 +70,7 @@ jobs: - name: Set up Go 1.x uses: actions/setup-go@v2 with: - go-version: ^1.15 + go-version: ^1.16 - name: Build fat binary id: builder @@ -128,24 +130,19 @@ jobs: path: archive-org # Put files to archive.org directory - name: Create Release - uses: actions/create-release@v1 + uses: softprops/action-gh-release@v1 + if: startsWith(github.ref, 'refs/tags/') env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token with: - tag_name: ${{ github.ref }} - release_name: Release ${{ github.ref }} body: | + See [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/${{ github.sha }}/CHANGELOG.md). + **Digests in this release:** ``` ${{ needs.checksum.outputs.digest }} ``` - draft: false - prerelease: true - - - name: Upload release assets - uses: fnkr/github-action-ghr@v1 - if: startsWith(github.ref, 'refs/tags/') - env: - GHR_PATH: archive-org/ - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + draft: true + files: | + archive-org/* diff --git a/Makefile b/Makefile index df61470..7684c51 100644 --- a/Makefile +++ b/Makefile @@ -4,9 +4,7 @@ export GOPROXY = https://proxy.golang.org NAME = archive.org BINDIR ?= ./build/binary PACKDIR ?= ./build/package -LDFLAGS := $(shell echo "-X 'archive.org/version.Version=`git describe --tags --abbrev=0`'") -LDFLAGS := $(shell echo "${LDFLAGS} -X 'archive.org/version.Commit=`git rev-parse --short HEAD`'") -LDFLAGS := $(shell echo "${LDFLAGS} -X 'archive.org/version.BuildDate=`date +%FT%T%z`'") +LDFLAGS := $(shell echo "-X 'github.com/wabarc/archive.org.Version=`git describe --tags --abbrev=0`'") GOBUILD ?= CGO_ENABLED=0 go build -trimpath --ldflags "-s -w ${LDFLAGS} -buildid=" -v VERSION ?= $(shell git describe --tags `git rev-list --tags --max-count=1` | sed -e 's/v//g') GOFILES ?= $(wildcard ./cmd/archive.org/*.go) @@ -15,6 +13,7 @@ PACKAGES ?= $(shell go list ./...) PLATFORM_LIST = \ darwin-amd64 \ + darwin-arm64 \ linux-386 \ linux-amd64 \ linux-armv5 \ @@ -42,6 +41,7 @@ WINDOWS_ARCH_LIST = \ .PHONY: \ darwin-386 \ darwin-amd64 \ + darwin-arm64 \ linux-386 \ linux-amd64 \ linux-armv5 \ @@ -68,11 +68,7 @@ WINDOWS_ARCH_LIST = \ releases \ clean \ test \ - fmt \ - rpm \ - debian \ - debian-packages \ - docker-image + fmt darwin-386: GOARCH=386 GOOS=darwin $(GOBUILD) -o $(BINDIR)/$(NAME)-$@ $(GOFILES) diff --git a/cmd/archive.org/ia.go b/cmd/archive.org/ia.go new file mode 100644 index 0000000..3fcede9 --- /dev/null +++ b/cmd/archive.org/ia.go @@ -0,0 +1,54 @@ +package main + +import ( + "flag" + "fmt" + "os" + + "github.com/wabarc/archive.org" +) + +func main() { + var ( + playback bool + version bool + ) + + const playbackHelp = "Search archived URL" + const versionHelp = "Show version" + + flag.BoolVar(&playback, "playback", false, playbackHelp) + flag.BoolVar(&playback, "p", false, playbackHelp) + flag.BoolVar(&version, "version", false, versionHelp) + flag.BoolVar(&version, "v", false, versionHelp) + flag.Parse() + + if version { + fmt.Println(ia.Version) + os.Exit(0) + } + + args := flag.Args() + if len(args) < 1 { + flag.Usage() + e := os.Args[0] + fmt.Printf(" %s url [url]\n\n", e) + fmt.Printf("example:\n %s https://example.com https://example.org\n\n", e) + os.Exit(1) + } + + wbrc := &ia.Archiver{} + + if playback { + collects, _ := wbrc.Playback(args) + for orig, dest := range collects { + fmt.Println(orig, "=>", dest) + } + os.Exit(0) + } + + saved, _ := wbrc.Wayback(args) + for orig, dest := range saved { + fmt.Println(orig, "=>", dest) + } +} diff --git a/cmd/ia.go b/cmd/ia.go deleted file mode 100644 index 9001e71..0000000 --- a/cmd/ia.go +++ /dev/null @@ -1,28 +0,0 @@ -package ia - -import ( - "flag" - "fmt" - "os" - - "github.com/wabarc/archive.org/pkg" -) - -func Run() { - flag.Parse() - - args := flag.Args() - if len(args) < 1 { - flag.Usage() - e := os.Args[0] - fmt.Printf(" %s url [url]\n\n", e) - fmt.Printf("example:\n %s https://www.google.com https://www.bbc.co.uk/\n\n", e) - os.Exit(1) - } - - wbrc := &ia.Archiver{} - saved, _ := wbrc.Wayback(args) - for orig, dest := range saved { - fmt.Println(orig, "=>", dest) - } -} diff --git a/pkg/doc.go b/doc.go similarity index 100% rename from pkg/doc.go rename to doc.go diff --git a/go.mod b/go.mod index 398543b..782d5d9 100644 --- a/go.mod +++ b/go.mod @@ -2,4 +2,7 @@ module github.com/wabarc/archive.org go 1.15 -require github.com/wabarc/helper v0.0.0-20210127120855-10af37cc2616 +require ( + github.com/wabarc/helper v0.0.0-20210127120855-10af37cc2616 + github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee +) diff --git a/go.sum b/go.sum index d68b041..e61e7e0 100644 --- a/go.sum +++ b/go.sum @@ -1,2 +1,4 @@ github.com/wabarc/helper v0.0.0-20210127120855-10af37cc2616 h1:wZ5HtpmZAVUq0Im5Sm92ycJrTeLJk5lB/Kvh55Rd+Ps= github.com/wabarc/helper v0.0.0-20210127120855-10af37cc2616/go.mod h1:N9P4r7Rn46p4nkWtXV6ztN3p5ACVnp++bgfwjTqSxQ8= +github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee h1:MMIp++7eem2CI1jIYDoPByMwXeZAjsFo2ciBNtvhB80= +github.com/wabarc/logger v0.0.0-20210417045349-d0d82e8e99ee/go.mod h1:4uYr9fnQaQoDk1ttTzLnSB3lZm3i/vrJwN8EZIB2YuI= diff --git a/ia.go b/ia.go new file mode 100644 index 0000000..1e4d2a0 --- /dev/null +++ b/ia.go @@ -0,0 +1,212 @@ +package ia + +import ( + "encoding/json" + "fmt" + "io/ioutil" + "net/http" + "net/url" + "regexp" + "sync" + "time" + + "github.com/wabarc/helper" + "github.com/wabarc/logger" +) + +type Archiver struct { + Client *http.Client +} + +const ( + userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36" + timeout = 120 * time.Second +) + +var ( + host = "archive.org" + dest = "https://web." + host + base = "https://web.archive.org/save/" + + endpoint = "https://archive.org/wayback/available" +) + +// Wayback is the handle of saving webpages to archive.org +func (wbrc *Archiver) Wayback(links []string) (map[string]string, error) { + collects, results := make(map[string]string), make(map[string]string) + for _, link := range links { + if !helper.IsURL(link) { + logger.Info(link + " is invalid url.") + continue + } + collects[link] = link + } + if wbrc.Client == nil { + wbrc.Client = &http.Client{ + Timeout: timeout, + CheckRedirect: noRedirect, + } + } + + ch := make(chan string, len(collects)) + defer close(ch) + + var mu sync.Mutex + var wg sync.WaitGroup + for _, link := range collects { + wg.Add(1) + go func(link string) { + wbrc.archive(link, ch) + mu.Lock() + results[link] = <-ch + mu.Unlock() + wg.Done() + }(link) + } + wg.Wait() + + if len(results) == 0 { + return results, fmt.Errorf("No results") + } + + return results, nil +} + +// Playback handle searching archived webpages from archive.is +func (wbrc *Archiver) Playback(links []string) (map[string]string, error) { + collects, results := make(map[string]string), make(map[string]string) + for _, link := range links { + if !helper.IsURL(link) { + logger.Info(link + " is invalid url.") + continue + } + collects[link] = link + } + + if wbrc.Client == nil { + wbrc.Client = &http.Client{ + Timeout: timeout, + CheckRedirect: noRedirect, + } + } + + ch := make(chan string, len(collects)) + defer close(ch) + + var mu sync.Mutex + var wg sync.WaitGroup + for _, link := range collects { + wg.Add(1) + go func(link string) { + mu.Lock() + wbrc.search(link, ch) + results[link] = <-ch + mu.Unlock() + wg.Done() + }(link) + } + wg.Wait() + + if len(results) == 0 { + return results, fmt.Errorf("No results") + } + + return results, nil +} +func (wbrc *Archiver) archive(url string, ch chan<- string) { + req, err := http.NewRequest("GET", base+url, nil) + req.Header.Add("User-Agent", userAgent) + resp, err := wbrc.Client.Do(req) + if err != nil { + ch <- fmt.Sprint(err) + return + } + defer resp.Body.Close() + + var loc string + loc = resp.Header.Get("Content-Location") + + if len(loc) > 0 { + ch <- fmt.Sprintf("%v%v", dest, loc) + return + } + + loc = resp.Header.Get("Location") + if len(loc) > 0 { + ch <- fmt.Sprintf("%v%v", dest, loc) + return + } + + links := resp.Header.Get("Link") + re := regexp.MustCompile(`(?m)http[s]?:\/\/web\.archive\.org/web/[-a-zA-Z0-9@:%_\+.~#?&//=]*`) + if match := re.FindAllString(links, -1); len(match) > 0 { + loc = match[len(match)-1] + ch <- fmt.Sprintf("%v", loc) + return + } + + loc = resp.Request.URL.String() + if match := re.FindAllString(loc, -1); len(match) > 0 { + ch <- fmt.Sprintf("%v", loc) + return + } + + got := wbrc.latest(url) + + // HTTP 509 Bandwidth Limit Exceeded + if resp.StatusCode == 509 { + ch <- fmt.Sprint(got) + return + } + + if resp.StatusCode != 200 { + ch <- fmt.Sprint(got) + return + } + + ch <- fmt.Sprintf("The Internet Archive: %v %v for url: %v", resp.StatusCode, http.StatusText(resp.StatusCode), base+url) +} + +func (wbrc *Archiver) search(url string, ch chan<- string) { + ch <- wbrc.latest(url) +} + +func (wbrc *Archiver) latest(s string) string { + // https://web.archive.org/*/https://example.org + u := fmt.Sprintf("%s/*/%s", dest, s) + + if _, err := url.Parse(s); err != nil { + return u + } + + uri := endpoint + "?url=" + s + resp, err := wbrc.Client.Get(uri) + if err != nil { + return u + } + defer resp.Body.Close() + + data, err := ioutil.ReadAll(resp.Body) + if err != nil { + return u + } + + var dat map[string]interface{} + if err := json.Unmarshal(data, &dat); err != nil { + return u + } + + if archived, ok := dat["archived_snapshots"].(map[string]interface{}); ok { + if closest, ok := archived["closest"].(map[string]interface{}); ok { + if closest["available"].(bool) && closest["status"] == "200" { + return closest["url"].(string) + } + } + } + + return u +} + +func noRedirect(req *http.Request, via []*http.Request) error { + return http.ErrUseLastResponse +} diff --git a/ia_test.go b/ia_test.go new file mode 100644 index 0000000..8d143d0 --- /dev/null +++ b/ia_test.go @@ -0,0 +1,87 @@ +package ia + +import ( + "testing" +) + +func TestWayback(t *testing.T) { + var got map[string]string + + tests := []struct { + name string + urls []string + got int + }{ + { + name: "Without URLs", + urls: []string{}, + got: 0, + }, + { + name: "Has one invalid URL", + urls: []string{"foo bar", "https://example.com/"}, + got: 1, + }, + { + name: "URLs full matches", + urls: []string{"https://example.com/", "https://example.org/"}, + got: 2, + }, + } + + wbrc := &Archiver{} + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + got, _ = wbrc.Wayback(test.urls) + if len(got) != test.got { + t.Errorf("got = %d; want %d", len(got), test.got) + } + for orig, dest := range got { + if testing.Verbose() { + t.Log(orig, "=>", dest) + } + } + }) + } +} + +func TestPlayback(t *testing.T) { + var got map[string]string + + tests := []struct { + name string + urls []string + got int + }{ + { + name: "Without URLs", + urls: []string{}, + got: 0, + }, + { + name: "Has one invalid URL", + urls: []string{"foo bar", "https://example.com/"}, + got: 1, + }, + { + name: "URLs full matches", + urls: []string{"https://example.com/", "https://example.org/"}, + got: 2, + }, + } + + wbrc := &Archiver{} + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + got, _ = wbrc.Playback(test.urls) + if len(got) != test.got { + t.Errorf("got = %d; want %d", len(got), test.got) + } + for orig, dest := range got { + if testing.Verbose() { + t.Log(orig, "=>", dest) + } + } + }) + } +} diff --git a/main.go b/main.go deleted file mode 100644 index 23815d6..0000000 --- a/main.go +++ /dev/null @@ -1,7 +0,0 @@ -package main - -import "github.com/wabarc/archive.org/cmd" - -func main() { - ia.Run() -} diff --git a/main_test.go b/main_test.go deleted file mode 100644 index 6d8fed2..0000000 --- a/main_test.go +++ /dev/null @@ -1,21 +0,0 @@ -package main - -import ( - "strings" - "testing" - - "github.com/wabarc/archive.org/pkg" -) - -func TestWayback(t *testing.T) { - url := "https://www.google.com" - links := []string{url} - wbrc := &ia.Archiver{} - got, _ := wbrc.Wayback(links) - for _, dest := range got { - if strings.Contains(dest, url) == false || strings.Contains(dest, "archive.org") == false { - t.Error(got) - t.Fail() - } - } -} diff --git a/pkg/http.go b/pkg/http.go deleted file mode 100644 index 2c4613d..0000000 --- a/pkg/http.go +++ /dev/null @@ -1,120 +0,0 @@ -package ia - -import ( - "encoding/json" - "fmt" - "io/ioutil" - "net/http" - "net/url" - "regexp" - "time" -) - -type Archiver struct { -} - -const ( - userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36" - timeout = 120 * time.Second -) - -var ( - host = "archive.org" - dest = "https://web." + host - base = "https://web.archive.org/save/" - - endpoint = "https://archive.org/wayback/available" -) - -func (wbrc *Archiver) fetch(url string, ch chan<- string) { - client := &http.Client{ - Timeout: timeout, - } - req, err := http.NewRequest("GET", base+url, nil) - req.Header.Add("User-Agent", userAgent) - resp, err := client.Do(req) - if err != nil { - ch <- fmt.Sprint(err) - return - } - defer resp.Body.Close() - - var loc string - loc = resp.Header.Get("Content-Location") - - if len(loc) > 0 { - ch <- fmt.Sprintf("%v%v", dest, loc) - return - } - - loc = resp.Header.Get("Location") - if len(loc) > 0 { - ch <- fmt.Sprintf("%v%v", dest, loc) - return - } - - links := resp.Header.Get("Link") - re := regexp.MustCompile(`(?m)http[s]?:\/\/web\.archive\.org/web/[-a-zA-Z0-9@:%_\+.~#?&//=]*`) - if match := re.FindAllString(links, -1); len(match) > 0 { - loc = match[len(match)-1] - ch <- fmt.Sprintf("%v", loc) - return - } - - loc = resp.Request.URL.String() - if match := re.FindAllString(loc, -1); len(match) > 0 { - ch <- fmt.Sprintf("%v", loc) - return - } - - got := latest(url) - - // HTTP 509 Bandwidth Limit Exceeded - if resp.StatusCode == 509 { - ch <- fmt.Sprint(got) - return - } - - if resp.StatusCode != 200 { - ch <- fmt.Sprint(got) - return - } - - ch <- fmt.Sprintf("The Internet Archive: %v %v for url: %v", resp.StatusCode, http.StatusText(resp.StatusCode), base+url) -} - -func latest(s string) string { - // https://web.archive.org/*/https://example.org - u := fmt.Sprintf("%s/*/%s", dest, s) - - if _, err := url.Parse(s); err != nil { - return u - } - - endpoint += "?url=" + s - resp, err := http.Get(endpoint) - if err != nil { - return u - } - defer resp.Body.Close() - - data, err := ioutil.ReadAll(resp.Body) - if err != nil { - return u - } - - var dat map[string]interface{} - if err := json.Unmarshal(data, &dat); err != nil { - return u - } - - if archived, ok := dat["archived_snapshots"].(map[string]interface{}); ok { - if closest, ok := archived["closest"].(map[string]interface{}); ok { - if closest["available"].(bool) { - return closest["url"].(string) - } - } - } - - return u -} diff --git a/pkg/ia.go b/pkg/ia.go deleted file mode 100644 index 84616f2..0000000 --- a/pkg/ia.go +++ /dev/null @@ -1,44 +0,0 @@ -package ia - -import ( - "fmt" - "log" - "sync" - - "github.com/wabarc/helper" -) - -// Wayback is the handle of saving webpages to archive.org -func (wbrc *Archiver) Wayback(links []string) (map[string]string, error) { - collect, results := make(map[string]string), make(map[string]string) - for _, link := range links { - if !helper.IsURL(link) { - log.Print(link + " is invalid url.") - continue - } - collect[link] = link - } - - ch := make(chan string, len(collect)) - defer close(ch) - - var mu sync.Mutex - var wg sync.WaitGroup - for link := range collect { - wg.Add(1) - go func(link string) { - wbrc.fetch(link, ch) - mu.Lock() - results[link] = <-ch - mu.Unlock() - wg.Done() - }(link) - } - wg.Wait() - - if len(results) == 0 { - return results, fmt.Errorf("No results") - } - - return results, nil -} diff --git a/pkg/ia_test.go b/pkg/ia_test.go deleted file mode 100644 index 45ad995..0000000 --- a/pkg/ia_test.go +++ /dev/null @@ -1,28 +0,0 @@ -package ia - -import ( - "testing" -) - -func TestWayback(t *testing.T) { - var ( - links []string - got map[string]string - ) - - wbrc := &Archiver{} - got, _ = wbrc.Wayback(links) - if len(got) != 0 { - t.Errorf("got = %d; want 0", len(got)) - } - - links = []string{"https://www.bbc.com/", "https://www.google.com/"} - got, _ = wbrc.Wayback(links) - if len(got) == 0 { - t.Errorf("got = %d; want not equal 0", len(got)) - } - - for orig, dest := range got { - t.Log(orig, "=>", dest) - } -} diff --git a/version.go b/version.go index 7cae1a2..946e98e 100644 --- a/version.go +++ b/version.go @@ -1,12 +1,3 @@ -package main +package ia -import "fmt" - -var ( - version = "1.0.0" - date = "unknown" -) - -func init() { - fmt.Printf("version: %s\ndate: %s\n\n", version, date) -} +var Version = "dev"