From b6cfe234fdf139ece620a5ef42f10d17881768d9 Mon Sep 17 00:00:00 2001
From: mhmdiaa
Date: Sat, 25 Sep 2021 03:31:29 +0200
Subject: [PATCH] Major rewrite - support more use cases, more intuitive usage, new name

---
 Dockerfile | 9 +-
 README.md | 106 +++++++++++--
 examples/robots.json | 5 +
 examples/s3.json | 3 +
 go.mod | 3 +
 main.go | 370 +++++++++++++++++++++++++++++++++++++++++++
 waybackunifier.go | 131 ---------------
 7 files changed, 476 insertions(+), 151 deletions(-)
 create mode 100644 examples/robots.json
 create mode 100644 examples/s3.json
 create mode 100644 go.mod
 create mode 100644 main.go
 delete mode 100644 waybackunifier.go

diff --git a/Dockerfile b/Dockerfile
index 602232f..06c4927 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,7 @@
-FROM golang:1.8-onbuild
-MAINTAINER Mohammed Diaa
+FROM golang:1.17.1-alpine as build-env
+RUN go get -v github.com/mhmdiaa/chronos
-ENTRYPOINT ["app"]
\ No newline at end of file
+FROM alpine:3.14
+RUN apk add --no-cache bind-tools ca-certificates
+COPY --from=build-env /go/bin/chronos /usr/local/bin/chronos
+ENTRYPOINT ["chronos"]
diff --git a/README.md b/README.md
index 447bfef..1c14f07 100644
--- a/README.md
+++ b/README.md
@@ -1,26 +1,98 @@
-# WaybackUnifier
+# Chronos
-WaybackUnifier allows you to take a look at how a file has ever looked by aggregating all versions of this file, and creating a unified version that contains every line that has ever been in it.
+Chronos (previously known as WaybackUnifier) extracts pieces of data from a web page's history. It can be used to create custom wordlists, search for secrets, find old endpoints, etc.
-### Installation
-Go is required.
+---
+
+## Installation
+### From binary
+Download a prebuilt binary from https://github.com/mhmdiaa/chronos/releases/latest
+
+### From source
+Use `go get` to download and install the latest version
+```
+go get -u github.com/mhmdiaa/chronos
+```
+
+---
+
+## Presets
+Presets are predefined options (URL path, match regex, and extract regex) that can be used to organize and simplify different use cases. The preset definitions are stored in `~/.chronos` as JSON files.
+
+```
+$ cat ~/.chronos/robots.json
+{
+    "path": "/robots.txt",
+    "match": "Disallow",
+    "extract": "(?:\\s)(/.*)"
+}
+$ chronos -preset robots -t example.com
+$ # equivalent to...
+$ chronos -p /robots.txt -m Disallow -e "(?:\\s)(/.*)"
+```
+
+---
+
+## Example usage
+
+### Extract paths from robots.txt files and build a wordlist
 ```
-go get github.com/mhmdiaa/waybackunifier
+$ chronos -t example.com -p /robots.txt -m Disallow -e "(?:\\s)(/.*)" -o robots_wordlist.txt
 ```
-This will download the code, compile it, and leave a `waybackunifier` binary in $GOPATH/bin.
-### Syntax
+### Save all versions of a web page locally and filter out a specific status code
+```
+$ chronos -t http://example.com/this_is_403_now_but_has_it_always_been_like_this_question_mark -fs 403 -od output
+```
+
+### Save URLs of all subdomains of example.com that were last seen in 2015
+```
+$ chronos -t *.example.com -u -to 2015
+```
+
+### Run the S3 preset that extracts AWS S3 URLs
+```
+$ chronos -preset s3 -t example.com
+```
+
+
+---
+
+## Options
 ```
-    -concurrency int
-        Number of requests to make in parallel (default 1)
-    -output string
-        File to save results in (default "output.txt")
-    -sub string
-        list of comma-separated substrings to look for in snapshots (snapshots will only be considered if they contnain one of them) (default "Disallow,disallow")
-    -url string
-        URL to unify versions of (without protocol prefix) (default "site.com/robots.txt")
+Usage: chronos
+  -c int
+        Number of concurrent threads (default 10)
+  -e string
+        Extract regex
+  -fm string
+        Filter MIME types
+  -from string
+        Match results after a specific date (Format: yyyyMMddhhmmss)
+  -fs string
+        Filter status codes
+  -m string
+        Match regex
+  -mm string
+        Match MIME types
+  -ms string
+        Match status codes (default "200")
+  -o string
+        Output file path (default "output.txt")
+  -od string
+        Directory path to store matched results' entire pages
+  -p string
+        Path to add to the URL
+  -preset string
+        Preset name
+  -t string
+        Target URL/domain (supports wildcards)
+  -to string
+        Match results before a specific date (Format: yyyyMMddhhmmss)
+  -u    URLs only
 ```
-The settings are by default suitable for unifying robots.txt files. Feel free to change the value of `-sub` to anything else, or supply an empty string to get all versions of a file without filtering.
+---
-**Note:** Lines are saved *unordered* for performance reasons
\ No newline at end of file
+## Contributing
+Find a bug? Got a feature request? Have an interesting preset in mind? Issues and pull requests are always welcome :)
\ No newline at end of file
diff --git a/examples/robots.json b/examples/robots.json
new file mode 100644
index 0000000..38e0669
--- /dev/null
+++ b/examples/robots.json
@@ -0,0 +1,5 @@
+{
+    "path": "/robots.txt",
+    "match": "Disallow",
+    "extract": "(?:\\s)(/.*)"
+}
diff --git a/examples/s3.json b/examples/s3.json
new file mode 100644
index 0000000..dcb499a
--- /dev/null
+++ b/examples/s3.json
@@ -0,0 +1,3 @@
+{
+    "extract": "(?i)^([https:\\/\\/]*s3\\.amazonaws.com[\\/]+.*|[a-zA-Z0-9_-]*\\.s3\\.amazonaws.com\\/.*)$"
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..5ebbbf7
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,3 @@
+module github.com/mhmdiaa/chronos
+
+go 1.16
diff --git a/main.go b/main.go
new file mode 100644
index 0000000..f1098d1
--- /dev/null
+++ b/main.go
@@ -0,0 +1,370 @@
+package main
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"net/http"
+	"os"
+	"os/user"
+	"path/filepath"
+	"regexp"
+	"strings"
+	"sync"
+)
+
+type config struct {
+	Target       string
+	MatchRegex   string
+	ExtractRegex string
+	Path         string
+	Concurrency  int
+	URLsOnly     bool
+
+	Filters filters
+
+	OutputFile      string
+	outputDirectory string
+}
+
+type filters struct {
+	From             string
+	To               string
+	StatusMatchList  string
+	StatusFilterList string
+	MimeMatchList    string
+	MimeFilterList   string
+}
+
+type preset struct {
+	Path         string `json:"path"`
+	MatchRegex   string `json:"match"`
+	ExtractRegex string `json:"extract"`
+}
+
+func main() {
+	flag.Usage = usage
+	config := CreateConfig()
+
+	if config.Target == "" {
+		usage()
+		os.Exit(1)
+	}
+
+	// If a path is provided, append it to the target
+	if config.Path != "" {
+		config.Target = strings.TrimRight(config.Target, "/") + config.Path
+	}
+	fmt.Printf("[*] Target: %s\n", config.Target)
+
+	snapshots, err := getListOfSnapshots(config.Target, config.Filters, config.URLsOnly)
+	if err != nil {
+		log.Fatal(err)
+	}
+	fmt.Printf("[*] Found %d snapshots\n", len(snapshots))
+
+	if config.URLsOnly {
+		err := writeURLsToFile(config.OutputFile, snapshots)
+		if err != nil {
+			log.Fatal(err)
+		}
+		os.Exit(0)
+	}
+
+	if config.outputDirectory != "" {
+		err = os.MkdirAll(config.outputDirectory, os.ModePerm)
+		if err != nil {
+			log.Fatal(err)
+		}
+	}
+
+	// Shared set where the results are collected from all concurrent goroutines
+	allResults := struct {
+		sync.RWMutex
+		results map[string]bool
+	}{results: make(map[string]bool)}
+
+	limit := make(chan bool, config.Concurrency)
+
+	for _, snapshot := range snapshots {
+		limit <- true
+		go func(snapshot []string) {
+			defer func() { <-limit }()
+			fmt.Printf("[*] Requesting snapshot %s/%s\n", snapshot[0], snapshot[1])
+			results, err := MatchAndExtractFromSnapshot(snapshot, config.MatchRegex, config.ExtractRegex, config.outputDirectory)
+
+			if err != nil {
+				fmt.Printf("[X] %v\n", err)
+			} else {
+				fmt.Printf("[*] Found %d matches in %s/%s\n", len(results), snapshot[0], snapshot[1])
+				for _, result := range results {
+					allResults.Lock()
+					allResults.results[result] = true
+					allResults.Unlock()
+				}
+			}
+		}(snapshot)
+	}
+
+	// Wait for all in-flight goroutines to finish before writing the results
+	for i := 0; i < cap(limit); i++ {
+		limit <- true
+	}
+
+	err = writeSetToFile(config.OutputFile, allResults.results)
+	if err != nil {
+		log.Fatal(err)
+	}
+}
+
+func usage() {
+	fmt.Println("Usage: chronos ")
+	flag.PrintDefaults()
+	fmt.Println()
+	fmt.Println()
+	fmt.Println("EXAMPLE USAGE")
+	fmt.Println("  Extract paths from robots.txt files and build a wordlist")
+	fmt.Println("  chronos -t example.com -p /robots.txt -m Disallow -e \"(?:\\s)(/.*)\" -o robots_wordlist.txt")
+	fmt.Println()
+	fmt.Println("  Save all versions of a web page locally and filter out a specific status code")
+	fmt.Println("  chronos -t http://example.com/this_is_403_now_but_has_it_always_been_like_this_question_mark -fs 403 -od output")
+	fmt.Println()
+	fmt.Println("  Save URLs of all subdomains of example.com that were last seen in 2015")
+	fmt.Println("  chronos -t *.example.com -u -to 2015")
+}
+
+func CreateConfig() config {
+	var c config
+
+	// General options
+	flag.StringVar(&c.Target, "t", "", "Target URL/domain (supports wildcards)")
+	flag.StringVar(&c.MatchRegex, "m", "", "Match regex")
+	flag.StringVar(&c.ExtractRegex, "e", "", "Extract regex")
+	flag.StringVar(&c.Path, "p", "", "Path to add to the URL")
+	flag.IntVar(&c.Concurrency, "c", 10, "Number of concurrent threads")
+	flag.BoolVar(&c.URLsOnly, "u", false, "URLs only")
+	flag.StringVar(&c.OutputFile, "o", "output.txt", "Output file path")
+	flag.StringVar(&c.outputDirectory, "od", "", "Directory path to store matched results' entire pages")
+	var presetName string
+	flag.StringVar(&presetName, "preset", "", "Preset name")
+
+	// Filter options
+	flag.StringVar(&c.Filters.From, "from", "", "Match results after a specific date (Format: yyyyMMddhhmmss)")
+	flag.StringVar(&c.Filters.To, "to", "", "Match results before a specific date (Format: yyyyMMddhhmmss)")
+	flag.StringVar(&c.Filters.StatusMatchList, "ms", "200", "Match status codes")
+	flag.StringVar(&c.Filters.StatusFilterList, "fs", "", "Filter status codes")
+	flag.StringVar(&c.Filters.MimeMatchList, "mm", "", "Match MIME types")
+	flag.StringVar(&c.Filters.MimeFilterList, "fm", "", "Filter MIME types")
+
+	flag.Parse()
+
+	// Overwrite the parameters if a preset is passed
+	if presetName != "" {
+		preset, err := parsePreset(presetName)
+		if err != nil {
+			fmt.Printf("[X] Error loading the preset: %v\n", err)
+			fmt.Println("[X] Preset will be ignored")
+		} else {
+			if preset.MatchRegex != "" {
+				c.MatchRegex = preset.MatchRegex
+			}
+			if preset.ExtractRegex != "" {
+				c.ExtractRegex = preset.ExtractRegex
+			}
+			if preset.Path != "" {
+				c.Path = preset.Path
+			}
+		}
+	}
+	return c
+}
+
+func getPresetDir() (string, error) {
+	usr, err := user.Current()
+	if err != nil {
+		return "", err
+	}
+
+	path := filepath.Join(usr.HomeDir, ".chronos")
+	if _, err := os.Stat(path); !os.IsNotExist(err) {
+		return path, nil
+	} else {
+		return "", err
+	}
+}
+
+func parsePreset(presetName string) (preset, error) {
+	presetDir, err := getPresetDir()
+	if err != nil {
+		return preset{}, err
+	}
+
+	presetFile := filepath.Join(presetDir, presetName+".json")
+
+	var p preset
+	file, err := ioutil.ReadFile(presetFile)
+	if err != nil {
+		return preset{}, err
+	}
+	err = json.Unmarshal(file, &p)
+	if err != nil {
+		return preset{}, err
+	}
+	return p, nil
+}
+
+func ConvertCommaSeparatedListToURLParams(list string, filter string, negative bool) string {
+	params := ""
+	var filterParam string
+	if negative {
+		filterParam = "&filter=!%s:%s"
+	} else {
+		filterParam = "&filter=%s:%s"
+	}
+
+	if list != "" {
+		for _, item := range strings.Split(list, ",") {
+			params += fmt.Sprintf(filterParam, filter, item)
+		}
+	}
+	return params
+}
+
+// Search WaybackMachine for a given URL and return a slice of [timestamp, url] slices
+func getListOfSnapshots(target string, filters filters, removeDuplicateURLs bool) ([][]string, error) {
+	searchURL := "https://web.archive.org/cdx/search/cdx?output=json"
+	searchURL += "&url=" + target
+	if filters.From != "" {
+		searchURL += "&from=" + filters.From
+	}
+	if filters.To != "" {
+		searchURL += "&to=" + filters.To
+	}
+	searchURL += ConvertCommaSeparatedListToURLParams(filters.StatusMatchList, "statuscode", false)
+	searchURL += ConvertCommaSeparatedListToURLParams(filters.StatusFilterList, "statuscode", true)
+	searchURL += ConvertCommaSeparatedListToURLParams(filters.MimeMatchList, "mimetype", false)
+	searchURL += ConvertCommaSeparatedListToURLParams(filters.MimeFilterList, "mimetype", true)
+	if removeDuplicateURLs {
+		searchURL += "&collapse=urlkey&fl=original"
+	} else {
+		searchURL += "&collapse=digest&fl=timestamp,original"
+	}
+
+	resp, err := http.Get(searchURL)
+	if err != nil {
+		return nil, fmt.Errorf("Error while loading search results: %v", err)
+	}
+	defer resp.Body.Close()
+
+	var results [][]string
+	body, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("Error while reading search results: %v", err)
+	}
+
+	err = json.Unmarshal(body, &results)
+	if err != nil {
+		return nil, fmt.Errorf("Error while deserializing search results: %v", err)
+	}
+	if len(results) == 0 {
+		return [][]string{}, fmt.Errorf("Didn't find any WaybackMachine entries for %s", target)
+	}
+
+	// The first item in the list is metadata
+	return results[1:], nil
+}
+
+// Get the content of a WaybackMachine snapshot and return it as a string
+func getContentOfSnapshot(snapshot []string) (string, error) {
+	timestamp := snapshot[0]
+	original := snapshot[1]
+
+	url := "https://web.archive.org/web/" + timestamp + "if_" + "/" + original
+	resp, err := http.Get(url)
+	if err != nil {
+		return "", fmt.Errorf("Error while loading a snapshot %s/%s: %v", timestamp, original, err)
+	}
+	defer resp.Body.Close()
+
+	content, err := ioutil.ReadAll(resp.Body)
+	if err != nil {
+		return "", fmt.Errorf("Error while reading a snapshot %s/%s: %v", timestamp, original, err)
+	}
+	return string(content), nil
+}
+
+func MatchAndExtractFromSnapshot(snapshot []string, mRegex, eRegex, outputDir string) ([]string, error) {
+	content, err := getContentOfSnapshot(snapshot)
+	if err != nil {
+		return nil, err
+	}
+
+	if matchRegex(content, mRegex) {
+		if outputDir != "" {
+			filePath := filepath.Join(outputDir, snapshot[0]+"_"+strings.Replace(snapshot[1], "/", "_", -1))
+			err = writeContentToFile(filePath, content)
+			if err != nil {
+				fmt.Println(err)
+			}
+		}
+		results := extractRegex(content, eRegex)
+		return results, nil
+	} else {
+		return []string{}, nil
+	}
+}
+
+// Returns true if a page's content contains a string that matches the regex
+func matchRegex(content, regex string) bool {
+	r := regexp.MustCompile(regex)
+	matches := r.Find([]byte(content))
+
+	if matches != nil {
+		return true
+	}
+	return false
+}
+
+// Extract all matches of a regex from a page's content
+func extractRegex(content, regex string) []string {
+	r := regexp.MustCompile(regex)
+	var paths []string
+	matches := r.FindAllStringSubmatch(string(content), -1)
+	for _, i := range matches {
+		paths = append(paths, i[1])
+	}
+	return paths
+}
+
+func writeURLsToFile(filePath string, values [][]string) error {
+	f, err := os.Create(filePath)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+	for _, value := range values {
+		fmt.Fprintln(f, value[0])
+	}
+	return nil
+}
+
+func writeSetToFile(filePath string, values map[string]bool) error {
+	f, err := os.Create(filePath)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+	for value := range values {
+		fmt.Fprintln(f, value)
+	}
+	return nil
+}
+
+func writeContentToFile(filePath string, content string) error 
{ + f, err := os.Create(filePath) + if err != nil { + return err + } + defer f.Close() + _, err = fmt.Fprintln(f, content) + if err != nil { + return err + } + return nil +} diff --git a/waybackunifier.go b/waybackunifier.go deleted file mode 100644 index 831dd12..0000000 --- a/waybackunifier.go +++ /dev/null @@ -1,131 +0,0 @@ -package main - -import ( - "encoding/json" - "flag" - "fmt" - "io/ioutil" - "log" - "net/http" - "os" - "strings" - "sync" -) - -var results = struct { - sync.RWMutex - res map[string]struct{} -}{res: make(map[string]struct{})} - -func main() { - url := flag.String("url", "site.com/robots.txt", "URL to unify versions of (without protocol prefix)") - output := flag.String("output", "output.txt", "File to save results in") - concurrency := flag.Int("concurrency", 1, "Number of requests to make in parallel") - substrings := flag.String("sub", "Disallow,disallow", "list of comma-separated substrings to look for in snapshots (snapshots will only be considered if they contnain one of them)") - - flag.Parse() - var subs []string - - for _, sub := range strings.Split(*substrings, ",") { - subs = append(subs, sub) - } - - snapshots, err := getSnapshots(*url) - if err != nil { - log.Fatalf("couldn't get snapshots: %v", err) - } - fmt.Printf("[*] Found %d snapshots", len(snapshots)) - - lim := make(chan bool, *concurrency) - for _, snapshot := range snapshots { - lim <- true - go func(snapshot []string) { - defer func() { <-lim }() - unifySnapshots(snapshot, subs) - if err != nil { - log.Printf("couldn't unify snapshots: %v", err) - } - }(snapshot) - } - - for i := 0; i < cap(lim); i++ { - lim <- true - } - - r := "" - for i := range results.res { - r += i + "\n" - } - f, err := os.Create(*output) - if err != nil { - log.Fatalf("couldn't create output file: %v", err) - } - defer f.Close() - - f.Write([]byte(r)) -} - -func unifySnapshots(snapshot []string, subs []string) { - content, err := getContent(snapshot) - if err != nil { - log.Printf("couldn't fetch snapshot: %v", err) - } - if len(subs) > 0 { - foundSub := false - for _, sub := range subs { - if strings.Contains(content, sub) { - foundSub = true - } - } - if !foundSub { - log.Printf("snapshot %s/%s doesn't contain any substring", snapshot[0], snapshot[1]) - } - } - c := strings.Split(content, "\n") - for _, line := range c { - results.Lock() - if line != "" { - results.res[line] = struct{}{} - } - results.Unlock() - } -} - -func getSnapshots(url string) ([][]string, error) { - resp, err := http.Get("https://web.archive.org/cdx/search/cdx?url=" + url + "&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest") - if err != nil { - return nil, fmt.Errorf("coudln't load waybackmachine search results for %s: %v", url, err) - } - defer resp.Body.Close() - - var results [][]string - body, err := ioutil.ReadAll(resp.Body) - if err != nil { - return nil, fmt.Errorf("couldn't read waybackmachine search results for %s: %v", url, err) - } - - err = json.Unmarshal(body, &results) - if err != nil { - return nil, fmt.Errorf("coudln't deserialize JSON response from waybackmachine for %s: %v", url, err) - } - if len(results) == 0 { - return make([][]string, 0), fmt.Errorf("") - } - return results[1:], nil -} - -func getContent(snapshot []string) (string, error) { - timestamp := snapshot[0] - original := snapshot[1] - url := "https://web.archive.org/web/" + timestamp + "if_" + "/" + original - resp, err := http.Get(url) - if err != nil { - return "", fmt.Errorf("couldn't load snapshot for %s/%s: %v", timestamp, 
original, err) - } - defer resp.Body.Close() - content, err := ioutil.ReadAll(resp.Body) - if err != nil { - return "", fmt.Errorf("couldn't read snapshot content for %s/%s: %v", timestamp, original, err) - } - return string(content), nil -}
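For readers who want to follow the new flow outside the patch, below is a minimal, self-contained sketch of the pipeline that the rewritten main.go implements: query the Wayback Machine CDX API for a target's snapshots, fetch one snapshot's raw content, and run an extract regex over it. The target and the regex are illustrative placeholders (the regex is borrowed from the robots preset), error handling is deliberately compressed, and the concurrency, status/MIME filtering, and output-file handling of the real tool are omitted; this is a simplified sketch, not the tool itself.

```go
// Sketch only: lists snapshots via the Wayback Machine CDX API, fetches the
// first snapshot's raw content, and prints every extract-regex capture.
package main

import (
	"encoding/json"
	"fmt"
	"io"
	"log"
	"net/http"
	"regexp"
)

func main() {
	target := "example.com/robots.txt"                 // placeholder target
	extract := regexp.MustCompile(`(?:\s)(/.*)`)        // extract regex from the robots preset

	// Same CDX query shape as getListOfSnapshots: JSON output, one row per
	// unique content digest, 200 responses only, timestamp+original fields.
	cdx := "https://web.archive.org/cdx/search/cdx?output=json" +
		"&url=" + target +
		"&filter=statuscode:200" +
		"&collapse=digest&fl=timestamp,original"

	var rows [][]string
	if err := getJSON(cdx, &rows); err != nil {
		log.Fatal(err)
	}
	if len(rows) < 2 {
		log.Fatalf("no snapshots found for %s", target)
	}

	// rows[0] is the CDX header; take the first real snapshot and fetch its
	// raw content the same way getContentOfSnapshot does (the "if_" suffix).
	timestamp, original := rows[1][0], rows[1][1]
	snapshotURL := "https://web.archive.org/web/" + timestamp + "if_/" + original

	resp, err := http.Get(snapshotURL)
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()
	content, err := io.ReadAll(resp.Body)
	if err != nil {
		log.Fatal(err)
	}

	// Equivalent of extractRegex: keep the first capture group of every match.
	for _, m := range extract.FindAllStringSubmatch(string(content), -1) {
		fmt.Println(m[1])
	}
}

// getJSON fetches a URL and decodes the response body as JSON into v.
func getJSON(url string, v interface{}) error {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	return json.Unmarshal(body, v)
}
```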