.generate

#!/usr/bin/env python3

import json
import re
import urllib.request
import sys
import hashlib
import html

class disposableHostGenerator():
    """Collect disposable e-mail domains from several sources and write them out.

    Sources are grouped by format in ``sources``. Each source is fetched (or
    read from disk), reduced to a list of syntactically valid host names and
    merged into ``self.domains`` / ``self.sha1``. Hosts found in a whitelist
    source are collected in ``self.skip`` and removed again by ``generate()``.
    """

    # source locations keyed by the format used to parse them
    sources = {
        'list': [ 'https://gist.githubusercontent.com/adamloving/4401361/raw/66688cf8ad890433b917f3230f44489aa90b03b7',
                  'https://gist.githubusercontent.com/michenriksen/8710649/raw/d42c080d62279b793f211f0caaffb22f1c980912',
                  'https://raw.githubusercontent.com/wesbos/burner-email-providers/master/emails.txt',
                  'https://raw.githubusercontent.com/andreis/disposable/master/blacklist.txt' ],
        'file': [ 'blacklist.txt' ],
        'json': [ 'https://raw.githubusercontent.com/ivolo/disposable-email-domains/master/index.json',
                  'https://robot-mail.com/api/v1/domains' ],
        'sha1': [ 'https://raw.githubusercontent.com/GeroldSetz/Mailinator-Domains/master/mailinator_domains_from_bdea.cc.txt' ],
        'html': [ 'https://tempr.email', 'https://emailfake.com', 'https://www.guerrillamail.com/en/' ],
        'option-select-box': [ 'https://temp-mail.org/en/option/change/', 'https://spamwc.de/' ],
        'whitelist': [ 'https://raw.githubusercontent.com/andreis/disposable/master/whitelist.txt' ],
        'whitelist_file': [ 'whitelist.txt' ]
    }

    # sources that are fetched again as long as a fetch yields unseen hosts
    scrape_sources = [ 'https://emailfake.com' ]

    # safety cap on re-fetches of a scrape source, so that a source which
    # keeps returning new hosts cannot recurse until RecursionError
    max_scrape_retries = 150

    # every label must hold 1..63 characters; an empty label (e.g. "a..b")
    # would make the 'idna' codec raise when the host is hashed
    domain_regex = re.compile(r'^[a-z\d-]{1,63}(\.[a-z\d-]{1,63})+$')
    # exactly 40 hex digits, nothing trailing
    sha1_regex = re.compile(r'^[a-fA-F0-9]{40}$')
    html_re = {
        'generic': re.compile(r"""<option[^>]*>@?([a-z0-9\-\.\&#;\d+]+)\s*(\(PW\))?<\/option>""", re.I),
        'tempr.email': re.compile(r"""<option\s+value[^>]*>@?([a-z0-9\-\.\&#;\d+]+)\s*(\(PW\))?<\/option>""", re.I),
        'emailfake.com': re.compile(r"""change_dropdown_list[^"]+"[^>]+>@?([a-z0-9\.-]{1,128})""", re.I)
    }
    option_select_re = re.compile(
        r"""<option value="@?[^"]+">@?([a-z0-9\-\.]+\.[a-z0-9\-\.]+)<\/option>""", re.I)

    def __init__(self, options = None, out_file = None):
        """Set up empty result containers.

        options  -- dict of settings read via ``options.get``: 'verbose',
                    'dns_verify', 'source_map', 'src_filter'
        out_file -- base name (without extension) of the files to read and
                    write; defaults to 'domains'
        """
        self.no_mx = []
        self.domains = {}
        self.sha1 = {}
        self.old_domains = {}
        self.old_sha1 = {}
        self.legacy_domains = {}
        self.source_map = {}
        self.skip = []
        self.scrape = []

        if not options:
            options = {}

        self.options = options
        self.supported_formats = list(self.sources.keys())
        self.out_file = 'domains' if out_file is None else out_file

    def verbosePrint(self, msg):
        """Print msg, but only when the 'verbose' option is set."""
        if self.options.get('verbose'):
            print(msg)

    def process_source(self, url, fmt, encoding='utf-8', timeout=3, retry=None):
        """Load one source and merge its hosts into the collected results.

        url      -- URL to fetch, or a local path for the 'file' and
                    'whitelist_file' formats
        fmt      -- one of the keys of ``sources``; anything else is ignored
        encoding -- encoding used to decode fetched data
        timeout  -- timeout in seconds passed to urlopen
        retry    -- number of re-fetches already done for a scrape source

        Returns True when the source was processed, None when it was skipped
        (unknown format, unreadable file, failed fetch, invalid JSON or no
        valid host found).
        """
        if fmt not in self.supported_formats:
            return
        retry = 0 if retry is None else retry
        if retry > 9:
            self.verbosePrint("Retry {0} for {1}".format(retry, url))

        lines = []
        data = b''
        if fmt in ('file', 'whitelist_file'):
            try:
                with open(url, 'rb') as f:
                    lines = [line.strip().decode('utf8') for line in f]
            except IOError:
                return
        else:
            try:
                req = urllib.request.Request(
                    url,
                    data=None,
                    headers={
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'
                    }
                )

                # close the response explicitly; keep data as bytes even
                # for an empty body, since it is decoded further down
                with urllib.request.urlopen(req, timeout=timeout) as response:
                    data = response.read() or b''
            except Exception as err:
                self.verbosePrint('WRN Fetching URL {0} failed, see error: {1}'.format(url, err))
                return

        if fmt in ('whitelist', 'list'):
            lines = [line.decode(encoding) for line in data.splitlines()]
        elif fmt == 'json':
            try:
                raw = json.loads(data.decode(encoding))
            except ValueError as err:
                # covers invalid JSON as well as undecodable bytes; treat it
                # like a failed fetch instead of aborting the whole run
                self.verbosePrint('WRN Fetching URL {0} failed, see error: {1}'.format(url, err))
                return
            if not isinstance(raw, list):
                self.verbosePrint('WRN This URL does not contain a JSON array: {0}'.format(url))
                return
            lines = [line for line in raw if line and isinstance(line, str)]
        elif fmt == 'option-select-box':
            lines = self.option_select_re.findall(data.decode(encoding))
        elif fmt == 'html':
            # use the site specific pattern when the URL names a known site
            dom_re = self.html_re['generic']
            for (re_domain, re_item) in self.html_re.items():
                if re_domain != 'generic' and re_domain in url:
                    dom_re = re_item
                    break

            # findall yields tuples for patterns with more than one group
            opts = dom_re.findall(data.decode(encoding))
            lines = [html.unescape(opt[0]) if isinstance(opt, tuple) else opt for opt in opts]
        elif fmt == 'sha1':
            for raw_line in data.splitlines():
                # non-ASCII bytes become replacement characters, which the
                # regex rejects, instead of raising UnicodeDecodeError
                sha1_str = raw_line.decode('ascii', 'replace').strip().lower()
                if sha1_str and self.sha1_regex.match(sha1_str):
                    self.sha1[sha1_str] = None
            return True

        lines = [line.lower().strip(' .,;@') for line in lines]
        lines = [line for line in lines if self.domain_regex.match(line)]

        if fmt in ('whitelist', 'whitelist_file'):
            known_skip = set(self.skip)
            for host in lines:
                if host not in known_skip:
                    known_skip.add(host)
                    self.skip.append(host)
            return True

        if not lines:
            self.verbosePrint('WRN No results for this source: {0}'.format(url))
            return

        # extend the source map entry rather than replacing it, so hosts
        # found by earlier fetches of a scrape source are not lost
        recorded = self.source_map.setdefault(url, [])
        known_recorded = set(recorded)
        is_scrape_source = url in self.scrape_sources
        known_scrape = set(self.scrape)
        need_scrape = False

        for host in lines:
            self.domains[host] = None
            self.sha1[hashlib.sha1(host.encode('idna')).hexdigest()] = None
            self.legacy_domains[host] = None

            if host not in known_recorded:
                known_recorded.add(host)
                recorded.append(host)

            if is_scrape_source and host not in known_scrape:
                known_scrape.add(host)
                self.scrape.append(host)
                need_scrape = True

        # fetch a scrape source again as long as it keeps yielding new hosts
        if need_scrape and retry < self.max_scrape_retries:
            return self.process_source(url, fmt, encoding, timeout, retry+1)

        return True

    def _read_list_file(self, path):
        """Return the non-empty lines of path as dict keys; {} if unreadable."""
        entries = {}
        try:
            with open(path) as f:
                for line in f:
                    entry = line.strip()
                    if entry:
                        entries[entry] = None
        except IOError:
            pass
        return entries

    def readFiles(self):
        """Read the current (old) output files for comparison.

        Replaces old_domains, old_sha1 and legacy_domains with the content
        of '<out_file>.txt', '<out_file>_sha1.txt' and
        '<out_file>_legacy.txt'; a missing file gives an empty dict.
        """
        self.old_domains = self._read_list_file(self.out_file + '.txt')
        self.old_sha1 = self._read_list_file(self.out_file + '_sha1.txt')
        self.legacy_domains = self._read_list_file(self.out_file + '_legacy.txt')

    def generate(self):
        """Merge all lists.

        Processes every configured source (or only the one matching the
        'src_filter' option), removes whitelisted hosts and, with the
        'dns_verify' option, collects hosts without MX record in no_mx.

        Returns False only when the 'verbose' option is set and neither
        domains nor hashes differ from the existing output files; returns
        True otherwise. Exceptions raised while processing a source are
        reported and re-raised.
        """
        src_filter = self.options.get('src_filter')

        # build domains dict
        for fmt in self.supported_formats:
            for src in self.sources[fmt]:
                if src_filter is not None and src != src_filter:
                    continue

                try:
                    self.process_source(src, fmt)
                except Exception as err:
                    self.verbosePrint((src, fmt, err))
                    raise

        # add custom whitelist
        for domain in self.skip:
            self.domains.pop(domain, None)
            self.sha1.pop(hashlib.sha1(domain.encode('idna')).hexdigest(), None)

        # MX verify check
        self.no_mx = []
        if self.options.get('dns_verify'):
            import dns.resolver
            # dnspython 2 renamed query() to resolve(); support both
            resolve = getattr(dns.resolver, 'resolve', None) or dns.resolver.query
            for domain in self.domains:
                try:
                    valid = bool(resolve(domain, 'MX'))
                except Exception:
                    # any lookup failure counts as "no MX";
                    # KeyboardInterrupt is not an Exception and propagates
                    valid = False

                if not valid:
                    self.no_mx.append(domain)

        if self.options.get('verbose'):
            if not self.old_domains:
                self.readFiles()

            added = [domain for domain in self.domains if domain not in self.old_domains]
            removed = [domain for domain in self.old_domains if domain not in self.domains]

            added_sha1 = [sha_str for sha_str in self.sha1 if sha_str not in self.old_sha1]
            removed_sha1 = [sha_str for sha_str in self.old_sha1 if sha_str not in self.sha1]

            self.verbosePrint('Fetched {0} domains and {1} hashes'.format(len(self.domains), len(self.sha1)))
            if self.options.get('dns_verify'):
                self.verbosePrint(' - {0} domain(s) have no MX'.format(len(self.no_mx)))
            self.verbosePrint(' - {0} domain(s) added'.format(len(added)))
            self.verbosePrint(' - {0} domain(s) removed'.format(len(removed)))
            self.verbosePrint(' - {0} hash(es) added'.format(len(added_sha1)))
            self.verbosePrint(' - {0} hash(es) removed'.format(len(removed_sha1)))
            # stop if nothing has changed
            if not (added or removed or added_sha1 or removed_sha1):
                return False

            if self.options.get('src_filter'):
                self.verbosePrint(self.domains.keys())

        return True

    def _write_list(self, suffix, items):
        """Write items to '<out_file><suffix>.txt' (one per line) and '.json'."""
        with open(self.out_file + suffix + '.txt', 'w') as ff:
            ff.write('\n'.join(items))

        with open(self.out_file + suffix + '.json', 'w') as ff:
            ff.write(json.dumps(items))

    def writeToFile(self):
        """Write the collected domains and hashes, sorted, to the output files.

        Always writes '<out_file>.txt/.json' and '<out_file>_sha1.txt/.json'.
        Writes '<out_file>_source_map.txt' when the 'source_map' option is
        set and '<out_file>_mx.txt/.json' when no_mx is not empty.
        """
        # write new list to file(s)
        domains = sorted(self.domains)
        self._write_list('', domains)

        if self.options.get('source_map'):
            with open(self.out_file + '_source_map.txt', 'w') as ff:
                for (src_url, source_map_domains) in sorted(self.source_map.items()):
                    source_map_domains.sort()
                    ff.write(src_url + ':' + ('\n%s:' % src_url).join(source_map_domains) + "\n")

        if self.no_mx:
            # build a filtered copy; self.domains itself must stay complete
            no_mx = set(self.no_mx)
            self._write_list('_mx', [domain for domain in domains if domain not in no_mx])

        # write new hash list to file(s)
        self._write_list('_sha1', sorted(self.sha1))

if __name__ == '__main__':
    # Command line entry point. Recognized arguments:
    #   --dns-verify   collect domains without MX record
    #   --source-map   also write the source map file
    #   --quiet        disable verbose output
    #   --src <url>    only process the given source
    # Exit status is 0 when the lists were written, 1 otherwise.
    exit_status = 1
    options = {
        'dns_verify': '--dns-verify' in sys.argv,
        'source_map': '--source-map' in sys.argv,
        'src_filter': None,
        'verbose': '--quiet' not in sys.argv
    }

    if '--src' in sys.argv:
        src_index = sys.argv.index('--src') + 1
        # "--src" given as the last argument has no value to read
        if src_index >= len(sys.argv):
            sys.exit('ERR --src requires a source URL argument')
        options['src_filter'] = sys.argv[src_index]

    dhg = disposableHostGenerator(options)
    # with a source filter the files are written even if nothing changed
    if dhg.generate() or options.get('src_filter') is not None:
        exit_status = 0
        dhg.writeToFile()
    sys.exit(exit_status)