From eb9eb15f80f7b099164e6e11425aa70b7371c50c Mon Sep 17 00:00:00 2001 From: simon Date: Wed, 8 May 2024 09:55:18 +0800 Subject: [PATCH] extractor modify --- prerank-stages/extractor/main.go | 3 ++- prerank-stages/go.mod | 2 +- prerank-stages/go.sum | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/prerank-stages/extractor/main.go b/prerank-stages/extractor/main.go index d7a8e96..2a60eac 100644 --- a/prerank-stages/extractor/main.go +++ b/prerank-stages/extractor/main.go @@ -50,7 +50,8 @@ func main() { if rank.RawContent == "" { continue } - fullContent, pureContent, _, _, _, _, _, _ := processor.ArticleReadabilityExtractor(rank.RawContent, rank.Url, "", "", true) + //fullContent, pureContent, _, _, _, _, _, _ := processor.ArticleReadabilityExtractor(rank.RawContent, rank.Url, "", "", true) + fullContent, pureContent := processor.ArticleContentExtractor(rank.RawContent, rank.Url, "", "") var contentLen int if rank.Language != "zh-cn" { contentArr := strings.Split(pureContent, " ") diff --git a/prerank-stages/go.mod b/prerank-stages/go.mod index 40cc5c4..1d655de 100644 --- a/prerank-stages/go.mod +++ b/prerank-stages/go.mod @@ -3,7 +3,7 @@ module bytetrade.io/web3os/prerank_stages go 1.20 require ( - github.com/beclab/article-extractor v0.0.2 + github.com/beclab/article-extractor v0.0.5 github.com/redis/go-redis/v9 v9.5.1 go.mongodb.org/mongo-driver v1.15.0 go.uber.org/zap v1.27.0 diff --git a/prerank-stages/go.sum b/prerank-stages/go.sum index ade59d1..e4d07c3 100644 --- a/prerank-stages/go.sum +++ b/prerank-stages/go.sum @@ -2,8 +2,8 @@ github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VP github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= -github.com/beclab/article-extractor v0.0.2 h1:LGPSv+qAvBtlT40bzLzQNTol6Cex5M4K5EbiZKh25iA= -github.com/beclab/article-extractor v0.0.2/go.mod h1:ih8W3OrtbD586xkigBhfrKWD1GDFXEEbow3uVZ6U/bI= +github.com/beclab/article-extractor v0.0.5 h1:9CQIzS7Rj3qz03jvhXHbzv46PrbkUpgEnIPx40Xtpao= +github.com/beclab/article-extractor v0.0.5/go.mod h1:ih8W3OrtbD586xkigBhfrKWD1GDFXEEbow3uVZ6U/bI= github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= github.com/cespare/xxhash/v2 v2.2.0 h1:DC2CZ1Ep5Y4k3ZQ899DldepgrayRUGE6BBZ/cd9Cj44=