diff --git a/pom.xml b/pom.xml index 901bcd2..b85b927 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ com.github.hronom scrape-dat-website - 1.0.5 + 1.0.6 jar scrape-dat-website diff --git a/src/main/java/com/github/hronom/scrape/dat/website/controllers/ScrapeButtonController.java b/src/main/java/com/github/hronom/scrape/dat/website/controllers/ScrapeButtonController.java index 964c119..2ac755c 100644 --- a/src/main/java/com/github/hronom/scrape/dat/website/controllers/ScrapeButtonController.java +++ b/src/main/java/com/github/hronom/scrape/dat/website/controllers/ScrapeButtonController.java @@ -6,6 +6,10 @@ import com.gargoylesoftware.htmlunit.WebRequest; import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.github.hronom.scrape.dat.website.views.ScrapeView; +import com.ui4j.api.browser.BrowserEngine; +import com.ui4j.api.browser.BrowserFactory; +import com.ui4j.api.browser.Page; +import com.ui4j.api.browser.PageConfiguration; import org.apache.commons.lang3.StringEscapeUtils; import org.apache.logging.log4j.LogManager; @@ -15,10 +19,6 @@ import org.jsoup.nodes.Element; import org.jsoup.select.Elements; -import com.ui4j.api.browser.BrowserEngine; -import com.ui4j.api.browser.BrowserFactory; -import com.ui4j.api.browser.Page; - import java.awt.event.ActionEvent; import java.awt.event.ActionListener; import java.net.URL; @@ -29,9 +29,38 @@ public class ScrapeButtonController { private final ScrapeView scrapeView; + private final WebClient webClient; + private final BrowserEngine browserEngine; + public ScrapeButtonController(ScrapeView scrapeViewArg) { scrapeView = scrapeViewArg; - scrapeView.addScrapeButtonActionListener(createScrapeButtonActionListener2()); + scrapeView.addScrapeButtonActionListener(createScrapeButtonActionListener()); + + // Create HTMLUnit WebClient. + { + webClient = new WebClient(BrowserVersion.FIREFOX_38); + webClient.getOptions().setCssEnabled(true); + webClient.getOptions().setJavaScriptEnabled(true); + webClient.getOptions().setPopupBlockerEnabled(false); + webClient.getOptions().setRedirectEnabled(true); + webClient.getOptions().setActiveXNative(true); + webClient.getOptions().setAppletEnabled(true); + webClient.getOptions().setUseInsecureSSL(true); + webClient.getOptions().setThrowExceptionOnScriptError(false); + webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); + webClient.getCookieManager().setCookiesEnabled(true); + webClient.setAjaxController(new AjaxController() { + @Override + public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) { + return true; + } + }); + } + + // Create Ui4j BrowserEngine. + { + browserEngine = BrowserFactory.getWebKit(); + } } public ActionListener createScrapeButtonActionListener() { @@ -40,203 +69,166 @@ public ActionListener createScrapeButtonActionListener() { public void actionPerformed(ActionEvent event) { Executors.newSingleThreadExecutor().submit(new Runnable() { public void run() { - // Disable fields in view. - scrapeView.setWebsiteUrlTextFieldEnabled(false); - scrapeView.setSelectorTextFieldEnabled(false); - scrapeView.setScrapeButtonEnabled(false); - scrapeView.setWorkInProgress(true); - scrapeView.setOutput(""); - - scrapeView.setProgressBarTaskText("initializing"); - logger.info("Start processing..."); - long beginTime = System.currentTimeMillis(); - - // Output input parameters. - if (!scrapeView.getWebsiteUrl().isEmpty() && - !scrapeView.getSelector().isEmpty()) { - logger.info("Input parameters: \"" + - scrapeView.getWebsiteUrl() + "\", \"" + - scrapeView.getSelector() + "\", \""); + if (scrapeView.isUi4jEnabled()) { + processByUi4j(); + } else { + processByHtmlUnit(); } - - // Process. - try (WebClient webClient = createWebClient()) { - URL url = new URL(scrapeView.getWebsiteUrl()); - scrapeView.setProgressBarTaskText("requesting page"); - logger.info("Requesting page..."); - HtmlPage page = webClient.getPage(url); - logger.info("Requesting of page completed."); - - scrapeView.setProgressBarTaskText("viewing page as XML"); - logger.info("View page as XML"); - String xml = page.asXml(); - - // Unescape html. - scrapeView.setProgressBarTaskText("unescaping HTML"); - logger.info("Unescape html"); - xml = StringEscapeUtils.unescapeHtml4(xml); - - logger.info("Get selector"); - String selector = scrapeView.getSelector(); - if (!xml.isEmpty() && !selector.isEmpty()) { - scrapeView.setProgressBarTaskText("parsing HTML"); - logger.info("Parse HTML"); - Document doc = Jsoup.parse(xml); - - scrapeView.setProgressBarTaskText("selecting elements in HTML"); - logger.info("select elements in HTML"); - Elements selectedElements = doc.select(selector); - - if (!selectedElements.isEmpty()) { - scrapeView.setProgressBarTaskText("parsing selected elements"); - logger.info("Parse extracted elements"); - StringBuilder sb = new StringBuilder(); - for (Element element : selectedElements) { - String body = element.html(); - sb.append(body); - sb.append("\n"); - sb.append("\n"); - } - scrapeView.setOutput(sb.toString()); - } - } - } catch (Exception e) { - logger.error(e); - } - - long endTime = System.currentTimeMillis(); - logger.info("Process time: " + (endTime - beginTime) + " ms."); - logger.info("Processing complete."); - - // Enable fields in view. - scrapeView.setWorkInProgress(false); - scrapeView.setScrapeButtonEnabled(true); - scrapeView.setSelectorTextFieldEnabled(true); - scrapeView.setWebsiteUrlTextFieldEnabled(true); } }); } }; } - public ActionListener createScrapeButtonActionListener2() { - return new ActionListener() { - @Override - public void actionPerformed(ActionEvent event) { - Executors.newSingleThreadExecutor().submit(new Runnable() { - public void run() { - // Disable fields in view. - scrapeView.setWebsiteUrlTextFieldEnabled(false); - scrapeView.setSelectorTextFieldEnabled(false); - scrapeView.setScrapeButtonEnabled(false); - scrapeView.setWorkInProgress(true); - scrapeView.setOutput(""); - - scrapeView.setProgressBarTaskText("initializing"); - logger.info("Start processing..."); - long beginTime = System.currentTimeMillis(); - - // Output input parameters. - if (!scrapeView.getWebsiteUrl().isEmpty() && - !scrapeView.getSelector().isEmpty()) { - logger.info("Input parameters: \"" + - scrapeView.getWebsiteUrl() + "\", \"" + - scrapeView.getSelector() + "\", \""); - } - - // Process. - BrowserEngine browserEngine = createBrowserEngine(); - - // Navigate to blank page. - scrapeView.setProgressBarTaskText("requesting page"); - logger.info("Requesting page..."); - Page page = browserEngine.navigate(scrapeView.getWebsiteUrl()); - logger.info("Requesting of page completed."); - - scrapeView.setProgressBarTaskText("viewing page as XML"); - logger.info("View page as XML"); - String html = page.getDocument().getBody().getInnerHTML();; - - // Unescape html. - scrapeView.setProgressBarTaskText("unescaping HTML"); - logger.info("Unescape html"); - html = StringEscapeUtils.unescapeHtml4(html); - - logger.info("Get selector"); - String selector = scrapeView.getSelector(); - if (!html.isEmpty() && !selector.isEmpty()) { - scrapeView.setProgressBarTaskText("parsing HTML"); - logger.info("Parse HTML"); - Document doc = Jsoup.parse(html); - - scrapeView.setProgressBarTaskText("selecting elements in HTML"); - logger.info("select elements in HTML"); - Elements selectedElements = doc.select(selector); - - if (!selectedElements.isEmpty()) { - scrapeView.setProgressBarTaskText("parsing selected elements"); - logger.info("Parse extracted elements"); - StringBuilder sb = new StringBuilder(); - for (Element element : selectedElements) { - String body = element.html(); - sb.append(body); - sb.append("\n"); - sb.append("\n"); - } - scrapeView.setOutput(sb.toString()); - } - } - - long endTime = System.currentTimeMillis(); - logger.info("Process time: " + (endTime - beginTime) + " ms."); - logger.info("Processing complete."); - - // Enable fields in view. - scrapeView.setWorkInProgress(false); - scrapeView.setScrapeButtonEnabled(true); - scrapeView.setSelectorTextFieldEnabled(true); - scrapeView.setWebsiteUrlTextFieldEnabled(true); + public void processByHtmlUnit() { + // Disable fields in view. + scrapeView.setWebsiteUrlTextFieldEnabled(false); + scrapeView.setSelectorTextFieldEnabled(false); + scrapeView.setScrapeButtonEnabled(false); + scrapeView.setWorkInProgress(true); + scrapeView.setOutput(""); + + scrapeView.setProgressBarTaskText("initializing"); + logger.info("Start processing..."); + long beginTime = System.currentTimeMillis(); + + // Output input parameters. + if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) { + logger.info("Input parameters: \"" + + scrapeView.getWebsiteUrl() + "\", \"" + + scrapeView.getSelector() + "\", \""); + } + + // Process. + try { + URL url = new URL(scrapeView.getWebsiteUrl()); + scrapeView.setProgressBarTaskText("requesting page"); + logger.info("Requesting page..."); + HtmlPage page = webClient.getPage(url); + logger.info("Requesting of page completed."); + + scrapeView.setProgressBarTaskText("viewing page as XML"); + logger.info("View page as XML"); + String xml = page.asXml(); + + // Unescape html. + scrapeView.setProgressBarTaskText("unescaping HTML"); + logger.info("Unescape html"); + xml = StringEscapeUtils.unescapeHtml4(xml); + + logger.info("Get selector"); + String selector = scrapeView.getSelector(); + if (!xml.isEmpty() && !selector.isEmpty()) { + scrapeView.setProgressBarTaskText("parsing HTML"); + logger.info("Parse HTML"); + Document doc = Jsoup.parse(xml); + + scrapeView.setProgressBarTaskText("selecting elements in HTML"); + logger.info("select elements in HTML"); + Elements selectedElements = doc.select(selector); + + if (!selectedElements.isEmpty()) { + scrapeView.setProgressBarTaskText("parsing selected elements"); + logger.info("Parse extracted elements"); + StringBuilder sb = new StringBuilder(); + for (Element element : selectedElements) { + String body = element.html(); + sb.append(body); + sb.append("\n"); + sb.append("\n"); } - }); + scrapeView.setOutput(sb.toString()); + } } - }; - } + } catch (Exception e) { + logger.error(e); + } - private WebClient createWebClient() { - WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38); - webClient.getOptions().setCssEnabled(true); - webClient.getOptions().setJavaScriptEnabled(true); - webClient.getOptions().setPopupBlockerEnabled(false); - webClient.getOptions().setRedirectEnabled(true); - webClient.getOptions().setActiveXNative(true); - webClient.getOptions().setAppletEnabled(true); - webClient.getOptions().setUseInsecureSSL(true); - webClient.getOptions().setThrowExceptionOnScriptError(false); - webClient.getOptions().setThrowExceptionOnFailingStatusCode(false); - webClient.setAjaxController(new AjaxController() { - @Override - public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) { - return true; - } - }); - return webClient; + webClient.close(); + + long endTime = System.currentTimeMillis(); + logger.info("Process time: " + (endTime - beginTime) + " ms."); + logger.info("Processing complete."); + + // Enable fields in view. + scrapeView.setWorkInProgress(false); + scrapeView.setScrapeButtonEnabled(true); + scrapeView.setSelectorTextFieldEnabled(true); + scrapeView.setWebsiteUrlTextFieldEnabled(true); } - private BrowserEngine createBrowserEngine() { - // Get the instance of the webkit. - BrowserEngine browser = BrowserFactory.getWebKit(); + public void processByUi4j() { + // Disable fields in view. + scrapeView.setWebsiteUrlTextFieldEnabled(false); + scrapeView.setSelectorTextFieldEnabled(false); + scrapeView.setScrapeButtonEnabled(false); + scrapeView.setWorkInProgress(true); + scrapeView.setOutput(""); + + scrapeView.setProgressBarTaskText("initializing"); + logger.info("Start processing..."); + long beginTime = System.currentTimeMillis(); + + // Output input parameters. + if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) { + logger.info("Input parameters: \"" + + scrapeView.getWebsiteUrl() + "\", \"" + + scrapeView.getSelector() + "\", \""); + } // Navigate to blank page. - // Page page = browser.navigate("about:blank"); - - // Show the browser page. + scrapeView.setProgressBarTaskText("requesting page"); + logger.info("Requesting page..."); + PageConfiguration pageConfig = new PageConfiguration(); + Page page = browserEngine.navigate(scrapeView.getWebsiteUrl(), pageConfig); //page.show(); - //System.setProperty("ui4j.headless", "true"); + logger.info("Requesting of page completed."); + + scrapeView.setProgressBarTaskText("viewing page as XML"); + logger.info("View page as XML"); + String html = page.getDocument().getBody().getInnerHTML(); + ; + + // Unescape html. + scrapeView.setProgressBarTaskText("unescaping HTML"); + logger.info("Unescape html"); + html = StringEscapeUtils.unescapeHtml4(html); + + logger.info("Get selector"); + String selector = scrapeView.getSelector(); + if (!html.isEmpty() && !selector.isEmpty()) { + scrapeView.setProgressBarTaskText("parsing HTML"); + logger.info("Parse HTML"); + Document doc = Jsoup.parse(html); + + scrapeView.setProgressBarTaskText("selecting elements in HTML"); + logger.info("select elements in HTML"); + Elements selectedElements = doc.select(selector); + + if (!selectedElements.isEmpty()) { + scrapeView.setProgressBarTaskText("parsing selected elements"); + logger.info("Parse extracted elements"); + StringBuilder sb = new StringBuilder(); + for (Element element : selectedElements) { + String body = element.html(); + sb.append(body); + sb.append("\n"); + sb.append("\n"); + } + scrapeView.setOutput(sb.toString()); + } + } + + browserEngine.clearCookies(); - //page.getDocument().getBody().getInnerHTML(); + long endTime = System.currentTimeMillis(); + logger.info("Process time: " + (endTime - beginTime) + " ms."); + logger.info("Processing complete."); - // Append html header to the document body. - //page.getDocument().getBody().append("

Hello, World!

"); - return browser; + // Enable fields in view. + scrapeView.setWorkInProgress(false); + scrapeView.setScrapeButtonEnabled(true); + scrapeView.setSelectorTextFieldEnabled(true); + scrapeView.setWebsiteUrlTextFieldEnabled(true); } } \ No newline at end of file diff --git a/src/main/java/com/github/hronom/scrape/dat/website/views/ScrapeView.java b/src/main/java/com/github/hronom/scrape/dat/website/views/ScrapeView.java index 3016feb..4a67d17 100644 --- a/src/main/java/com/github/hronom/scrape/dat/website/views/ScrapeView.java +++ b/src/main/java/com/github/hronom/scrape/dat/website/views/ScrapeView.java @@ -12,6 +12,7 @@ public class ScrapeView extends JPanel { private final JTextField websiteUrlTextField; private final JLabel selectorLabel; private final JTextField selectorTextField; + private final JCheckBox ui4jCheckBox; private final JButton scrapeButton; private final JTextArea outputTextArea; private final JProgressBar progressBar; @@ -56,7 +57,7 @@ public ScrapeView() { } { - selectorLabel = new JLabel("Selector:"); + selectorLabel = new JLabel("CSS Selector:"); constraint.weightx = 0; constraint.weighty = 0; @@ -82,7 +83,8 @@ public ScrapeView() { } { - scrapeButton = new JButton("Scrape website"); + ui4jCheckBox = new JCheckBox("Use Ui4j for headless browser"); + ui4jCheckBox.setSelected(false); constraint.weightx = 1; constraint.weighty = 0; @@ -91,6 +93,19 @@ public ScrapeView() { constraint.gridwidth = 2; constraint.gridheight = 1; constraint.fill = GridBagConstraints.BOTH; + this.add(ui4jCheckBox, constraint); + } + + { + scrapeButton = new JButton("Scrape website"); + + constraint.weightx = 1; + constraint.weighty = 0; + constraint.gridx = 0; + constraint.gridy = 3; + constraint.gridwidth = 2; + constraint.gridheight = 1; + constraint.fill = GridBagConstraints.BOTH; this.add(scrapeButton, constraint); } @@ -108,7 +123,7 @@ public ScrapeView() { constraint.weightx = 1; constraint.weighty = 1; constraint.gridx = 0; - constraint.gridy = 3; + constraint.gridy = 4; constraint.gridwidth = 2; constraint.gridheight = 1; constraint.fill = GridBagConstraints.BOTH; @@ -125,7 +140,7 @@ public ScrapeView() { constraint.weightx = 1; constraint.weighty = 0; constraint.gridx = 0; - constraint.gridy = 4; + constraint.gridy = 5; constraint.gridwidth = 2; constraint.gridheight = 1; constraint.fill = GridBagConstraints.BOTH; @@ -133,6 +148,10 @@ public ScrapeView() { } } + public boolean isUi4jEnabled() { + return ui4jCheckBox.isSelected(); + } + public void addScrapeButtonActionListener(ActionListener actionListener) { scrapeButton.addActionListener(actionListener); } @@ -161,7 +180,7 @@ public void setOutput(String text) { outputTextArea.setText(text); } - public void setWorkInProgress(boolean working){ + public void setWorkInProgress(boolean working) { progressBar.setVisible(working); }