diff --git a/src/main/java/org/codelibs/riverweb/RiverWeb.java b/src/main/java/org/codelibs/riverweb/RiverWeb.java index ee8440c..66b9367 100644 --- a/src/main/java/org/codelibs/riverweb/RiverWeb.java +++ b/src/main/java/org/codelibs/riverweb/RiverWeb.java @@ -34,7 +34,8 @@ import org.codelibs.fess.crawler.client.http.impl.AuthenticationImpl; import org.codelibs.fess.crawler.client.http.ntlm.JcifsEngine; import org.codelibs.riverweb.app.service.ScriptService; -import org.codelibs.riverweb.entity.RiverConfig; +import org.codelibs.riverweb.config.RiverConfig; +import org.codelibs.riverweb.config.RiverConfigManager; import org.codelibs.riverweb.interval.WebRiverIntervalController; import org.codelibs.riverweb.util.ConfigProperties; import org.codelibs.riverweb.util.SettingsUtils; @@ -98,7 +99,7 @@ public class RiverWeb { protected ScriptService scriptService; @Resource - protected RiverConfig riverConfig; + protected RiverConfigManager riverConfigManager; @Resource protected String defaultUserAgent; @@ -236,6 +237,7 @@ private int crawl(Crawler crawler, String configId, String sessionId) { vars.put("client", esClient); vars.put("sessionId", sessionId); + final RiverConfig riverConfig = riverConfigManager.get(sessionId); try { // invoke execute event script executeScript(crawlSettings, vars, "execute"); @@ -452,6 +454,7 @@ private int crawl(Crawler crawler, String configId, String sessionId) { } finally { // invoke finish event script executeScript(crawlSettings, vars, "finish"); + riverConfigManager.remove(sessionId); if (cleanup) { crawler.cleanup(sessionId); diff --git a/src/main/java/org/codelibs/riverweb/entity/RiverConfig.java b/src/main/java/org/codelibs/riverweb/config/RiverConfig.java similarity index 94% rename from src/main/java/org/codelibs/riverweb/entity/RiverConfig.java rename to src/main/java/org/codelibs/riverweb/config/RiverConfig.java index dc202fd..ea033d7 100644 --- a/src/main/java/org/codelibs/riverweb/entity/RiverConfig.java +++ 
b/src/main/java/org/codelibs/riverweb/config/RiverConfig.java @@ -1,10 +1,11 @@ -package org.codelibs.riverweb.entity; +package org.codelibs.riverweb.config; import java.util.ArrayList; import java.util.List; import java.util.Map; import org.codelibs.fess.crawler.entity.ResponseData; +import org.codelibs.riverweb.entity.ScrapingRule; public class RiverConfig { diff --git a/src/main/java/org/codelibs/riverweb/config/RiverConfigManager.java b/src/main/java/org/codelibs/riverweb/config/RiverConfigManager.java new file mode 100644 index 0000000..de80c51 --- /dev/null +++ b/src/main/java/org/codelibs/riverweb/config/RiverConfigManager.java @@ -0,0 +1,25 @@ +package org.codelibs.riverweb.config; + +import java.util.HashMap; +import java.util.Map; + +public class RiverConfigManager { + protected Map<String, RiverConfig> configMap = new HashMap<>(); + + public RiverConfig get(final String sessionId) { + synchronized (configMap) { + if (configMap.containsKey(sessionId)) { + return configMap.get(sessionId); + } + RiverConfig config = new RiverConfig(); + configMap.put(sessionId, config); + return config; + } + } + + public RiverConfig remove(final String sessionId) { + synchronized (configMap) { + return configMap.remove(sessionId); + } + } +} diff --git a/src/main/java/org/codelibs/riverweb/crawler/RwCrawlerThread.java b/src/main/java/org/codelibs/riverweb/crawler/RwCrawlerThread.java index 59f2264..cc51b55 100644 --- a/src/main/java/org/codelibs/riverweb/crawler/RwCrawlerThread.java +++ b/src/main/java/org/codelibs/riverweb/crawler/RwCrawlerThread.java @@ -6,7 +6,8 @@ import org.codelibs.fess.crawler.client.CrawlerClient; import org.codelibs.fess.crawler.client.EsClient; import org.codelibs.fess.crawler.entity.UrlQueue; -import org.codelibs.riverweb.entity.RiverConfig; +import org.codelibs.riverweb.config.RiverConfig; +import org.codelibs.riverweb.config.RiverConfigManager; import org.codelibs.riverweb.util.ConversionUtil; import org.elasticsearch.action.search.SearchResponse; import 
org.elasticsearch.index.query.QueryBuilders; @@ -22,7 +23,8 @@ public class RwCrawlerThread extends CrawlerThread { @Override protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue urlQueue) { - final RiverConfig riverConfig = SingletonLaContainer.getComponent(RiverConfig.class); + final RiverConfigManager riverConfigManager = SingletonLaContainer.getComponent(RiverConfigManager.class); + final RiverConfig riverConfig = riverConfigManager.get(crawlerContext.getSessionId()); if (riverConfig.isIncremental()) { final EsClient esClient = SingletonLaContainer.getComponent(EsClient.class); try { diff --git a/src/main/java/org/codelibs/riverweb/transformer/ScrapingTransformer.java b/src/main/java/org/codelibs/riverweb/transformer/ScrapingTransformer.java index 7b3d05b..7c000c2 100644 --- a/src/main/java/org/codelibs/riverweb/transformer/ScrapingTransformer.java +++ b/src/main/java/org/codelibs/riverweb/transformer/ScrapingTransformer.java @@ -43,7 +43,8 @@ import org.codelibs.fess.crawler.transformer.impl.HtmlTransformer; import org.codelibs.riverweb.WebRiverConstants; import org.codelibs.riverweb.app.service.ScriptService; -import org.codelibs.riverweb.entity.RiverConfig; +import org.codelibs.riverweb.config.RiverConfig; +import org.codelibs.riverweb.config.RiverConfigManager; import org.codelibs.riverweb.entity.ScrapingRule; import org.codelibs.riverweb.util.SettingsUtils; import org.elasticsearch.index.query.QueryBuilders; @@ -90,23 +91,30 @@ public class ScrapingTransformer extends HtmlTransformer { public String[] copiedResonseDataFields = new String[] { "url", "parentUrl", "httpStatusCode", "method", "charSet", "contentLength", "mimeType", "executionTime", "lastModified" }; - protected RiverConfig riverConfig; - private EsClient esClient; + protected RiverConfigManager riverConfigManager; + protected ThreadLocal<Set<RequestData>> childUrlSetLocal = new ThreadLocal<Set<RequestData>>(); + protected ThreadLocal<RiverConfig> riverConfigLocal = new ThreadLocal<>(); + + @PostConstruct public 
void init() { - riverConfig = SingletonLaContainer.getComponent(RiverConfig.class); esClient = SingletonLaContainer.getComponent(EsClient.class); + riverConfigManager = SingletonLaContainer.getComponent(RiverConfigManager.class); } @Override public ResultData transform(final ResponseData responseData) { + final RiverConfig riverConfig = riverConfigManager.get(responseData.getSessionId()); + try { + riverConfigLocal.set(riverConfig); return super.transform(responseData); } finally { + riverConfigLocal.remove(); childUrlSetLocal.remove(); } } @@ -114,7 +122,7 @@ public ResultData transform(final ResponseData responseData) { @Override protected void updateCharset(final ResponseData responseData) { int preloadSize = preloadSizeForCharset; - final ScrapingRule scrapingRule = riverConfig.getScrapingRule(responseData); + final ScrapingRule scrapingRule = riverConfigLocal.get().getScrapingRule(responseData); if (scrapingRule != null) { final Integer s = scrapingRule.getSetting("preloadSizeForCharset", Integer.valueOf(0)); if (s.intValue() > 0) { @@ -164,14 +172,14 @@ protected String loadCharset(final InputStream inputStream, final int preloadSiz @Override protected void storeData(final ResponseData responseData, final ResultData resultData) { - final ScrapingRule scrapingRule = riverConfig.getScrapingRule(responseData); - if (scrapingRule == null) { - logger.info("Skip Scraping: " + responseData.getUrl()); - return; - } - File file = null; try { + final ScrapingRule scrapingRule = riverConfigLocal.get().getScrapingRule(responseData); + if (scrapingRule == null) { + logger.info("Skip Scraping: " + responseData.getUrl()); + return; + } + file = File.createTempFile("river-web-", ".tmp"); CopyUtil.copy(responseData.getResponseBody(), file); processData(scrapingRule, file, responseData, resultData); @@ -540,6 +548,7 @@ protected void addPropertyData(final Map dataMap, final String k protected void storeIndex(final ResponseData responseData, final Map dataMap) { final String 
sessionId = responseData.getSessionId(); + final RiverConfig riverConfig = riverConfigLocal.get(); final String indexName = riverConfig.getIndex(); final String typeName = riverConfig.getType(); final boolean overwrite = riverConfig.isOverwrite(); diff --git a/src/main/resources/config.xml b/src/main/resources/config.xml index 06a8d4e..6780342 100644 --- a/src/main/resources/config.xml +++ b/src/main/resources/config.xml @@ -7,6 +7,6 @@ - + diff --git a/src/test/java/org/codelibs/riverweb/transformer/ScrapingTransformerTest.java b/src/test/java/org/codelibs/riverweb/transformer/ScrapingTransformerTest.java index a5ffef0..e4aa6d8 100644 --- a/src/test/java/org/codelibs/riverweb/transformer/ScrapingTransformerTest.java +++ b/src/test/java/org/codelibs/riverweb/transformer/ScrapingTransformerTest.java @@ -12,13 +12,14 @@ import org.codelibs.core.io.ResourceUtil; import org.codelibs.fess.crawler.entity.ResponseData; import org.codelibs.fess.crawler.entity.ResultData; -import org.codelibs.riverweb.entity.RiverConfig; +import org.codelibs.riverweb.config.RiverConfig; +import org.codelibs.riverweb.config.RiverConfigManager; import org.junit.Test; public class ScrapingTransformerTest { @Test public void fess_codelibs_org() { - final RiverConfig riverConfig = new RiverConfig(); + final RiverConfigManager riverConfigManager = new RiverConfigManager(); final ScrapingTransformer transformer = new ScrapingTransformer() { @SuppressWarnings("unchecked") @Override @@ -31,10 +32,12 @@ protected void storeIndex(final ResponseData responseData, final Map) ((Map) dataMap.get("section2")).get("body")).size(), is(12)); } }; - transformer.riverConfig = riverConfig; + transformer.riverConfigManager = riverConfigManager; final String sessionId = "test"; final String url = "http://fess.codelibs.org/"; + final RiverConfig riverConfig = riverConfigManager.get(sessionId); + transformer.riverConfigLocal.set(riverConfig); final Map> scrapingRuleMap = new HashMap>(); 
addScrapingRuleMap(scrapingRuleMap, "text", "nav.sideMenus", "div.sidebar-nav ul li", Boolean.TRUE, Boolean.TRUE);