From e22e3f5a1d9e51b6fd12473554fb51d308f0dc8e Mon Sep 17 00:00:00 2001 From: Ruslan Date: Tue, 19 Nov 2024 14:51:41 +0200 Subject: [PATCH] feat: domain names are no longer duplicated. Add support of ABP parsing. Add goroutines for to parse blocklists faster. --- internal/hostsfile/processor.go | 134 ++++++++++++++++++++------- internal/hostsfile/processor_test.go | 5 + 2 files changed, 105 insertions(+), 34 deletions(-) create mode 100644 internal/hostsfile/processor_test.go diff --git a/internal/hostsfile/processor.go b/internal/hostsfile/processor.go index 95a7a13..bce685d 100644 --- a/internal/hostsfile/processor.go +++ b/internal/hostsfile/processor.go @@ -5,6 +5,7 @@ import ( "barrier/internal/http" "fmt" "strings" + "sync" "github.com/rs/zerolog/log" ) @@ -13,6 +14,7 @@ const localhost = "127.0.0.1" // Processor is a structure that is responsible for processing blocklists. type Processor struct { + wg *sync.WaitGroup config *config.Config httpClient *http.HTTP } @@ -22,15 +24,15 @@ type Result struct { startTag string endTag string descriptionComment string - parsedBlocklists []ParsedBlocklist + domainsBlocklist map[string]LineContent } -// ParsedBlocklist represents a completed result of blocklist +// BlocklistResult represents a parsed result of blocklist // that is ready to be appended into hosts file. -type ParsedBlocklist struct { +type BlocklistResult struct { DomainsCount int - linesContent []LineContent + linesContent map[string]LineContent } type LineContent struct { @@ -43,6 +45,7 @@ func NewProcessor(config *config.Config) *Processor { httpClient := http.New() return &Processor{ + wg: &sync.WaitGroup{}, config: config, httpClient: httpClient, } @@ -51,91 +54,154 @@ func NewProcessor(config *config.Config) *Processor { // Process processes blocklists and returns a finished result // that is ready to save to hosts file. func (p *Processor) Process() (Result, error) { - parsedBlocklists := make([]ParsedBlocklist, 0, len(p.config.Blocklists)) + blocklistsResult := make([]BlocklistResult, len(p.config.Blocklists)) + totalDomainsCount := 0 - for _, blocklist := range p.config.Blocklists { + for i, blocklist := range p.config.Blocklists { + i := i target := blocklist.Target - log.Info().Str("target", target).Msg("processing blocklist..") + p.wg.Add(1) - fileContent, err := p.httpClient.Get(target) - if err != nil { - log.Error().Err(err).Str("target", target).Msg("failed to process blocklist") - continue - } + go func() { + defer p.wg.Done() - parsedBlocklist := p.processBlocklist(fileContent) - log.Info().Str("target", target).Msgf("number of %d domains parsed", parsedBlocklist.DomainsCount) + blocklistResult, err := p.processBlocklist(target) + if err != nil { + log.Error().Err(err).Str("target", target).Msg("failed to process blocklist") + return + } - parsedBlocklists = append(parsedBlocklists, parsedBlocklist) + blocklistsResult[i] = blocklistResult + totalDomainsCount += blocklistResult.DomainsCount + }() } + p.wg.Wait() + + domainsBlocklist := p.domainsBlocklist(blocklistsResult) + result := Result{ startTag: StartTag, endTag: EndTag, descriptionComment: DescriptionComment, - parsedBlocklists: parsedBlocklists, + domainsBlocklist: domainsBlocklist, } + log.Info().Msgf("total number of uniq domains: %d", len(domainsBlocklist)) + return result, nil } -func (p *Processor) processBlocklist(content string) ParsedBlocklist { - lines := strings.Split(content, "\n") +func (p *Processor) processBlocklist(target string) (BlocklistResult, error) { + log.Info().Str("target", target).Msg("processing blocklist..") + + fileContent, err := p.httpClient.Get(target) + if err != nil { + return BlocklistResult{}, err + } - linesContent := make([]LineContent, 0, len(lines)) + linesContent := p.processContent(fileContent) + + blocklistResult := BlocklistResult{ + DomainsCount: len(linesContent), + linesContent: linesContent, + } + + log.Info().Str("target", target).Msgf("number of domains: %d", blocklistResult.DomainsCount) + + return blocklistResult, nil +} + +func (p *Processor) processContent(content string) map[string]LineContent { + lines := strings.Split(content, "\n") + linesContent := make(map[string]LineContent) for _, line := range lines { // remove empty spaces line := strings.TrimSpace(line) - // skip empty lines and comments - if line == "" || p.isLineComment(line) { + // skip empty lines, comments, ABP comments and ABP headers + if line == "" || p.isLineComment(line) || p.isABPComment(line) || p.isABPHeader(line) { continue } + // remove a comment in the middle of line line = p.removeInLineComment(line) - lineContent := LineContent{ - ipAddress: localhost, + // Convert all characters to lowercase + line = strings.ToLower(line) + + if p.isABPDomain(line) { + line = p.parseABPDomain(line) } + var domainName string parts := strings.Fields(line) if len(parts) == 1 { - lineContent.domainName = parts[0] + domainName = parts[0] } else { - lineContent.domainName = parts[1] + domainName = parts[1] } - linesContent = append(linesContent, lineContent) + lineContent := LineContent{ + ipAddress: localhost, + domainName: domainName, + } + + linesContent[domainName] = lineContent } - parsedBlocklist := ParsedBlocklist{ - linesContent: linesContent, - DomainsCount: len(linesContent), + return linesContent +} + +func (p *Processor) domainsBlocklist(blocklistsResult []BlocklistResult) map[string]LineContent { + domainsBlocklist := make(map[string]LineContent) + + for _, result := range blocklistsResult { + for _, line := range result.linesContent { + domainsBlocklist[line.domainName] = line + } } - return parsedBlocklist + return domainsBlocklist } func (p *Processor) isLineComment(line string) bool { return strings.HasPrefix(line, "#") } +// Example: https://raw.githubusercontent.com/hagezi/dns-blocklists/main/adblock/light.txt +func (p *Processor) isABPComment(line string) bool { + return strings.HasPrefix(line, "!") +} + +func (p *Processor) isABPHeader(line string) bool { + return strings.HasPrefix(line, "[") +} + func (p *Processor) removeInLineComment(line string) string { return strings.Split(line, "#")[0] } +func (p *Processor) isABPDomain(line string) bool { + return strings.HasPrefix(line, "||") && strings.HasSuffix(line, "^") +} + +func (p *Processor) parseABPDomain(line string) string { + line = strings.TrimPrefix(line, "||") + line = strings.TrimSuffix(line, "^") + return line +} + func (r Result) FormatToHostsfile() string { var builder strings.Builder builder.WriteString(r.startTag) builder.WriteString(r.descriptionComment) - for _, parsedBlocklist := range r.parsedBlocklists { - for _, lineContent := range parsedBlocklist.linesContent { - builder.WriteString(lineContent.Format()) - } + for _, domain := range r.domainsBlocklist { + builder.WriteString(domain.Format()) } builder.WriteString(r.endTag) diff --git a/internal/hostsfile/processor_test.go b/internal/hostsfile/processor_test.go new file mode 100644 index 0000000..2b19775 --- /dev/null +++ b/internal/hostsfile/processor_test.go @@ -0,0 +1,5 @@ +package hostsfile + +import "testing" + +func TestIsLineComment(t *testing.T) {}