diff --git a/pom.xml b/pom.xml
index 901bcd2..b85b927 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
com.github.hronom
scrape-dat-website
- 1.0.5
+ 1.0.6
jar
scrape-dat-website
diff --git a/src/main/java/com/github/hronom/scrape/dat/website/controllers/ScrapeButtonController.java b/src/main/java/com/github/hronom/scrape/dat/website/controllers/ScrapeButtonController.java
index 964c119..2ac755c 100644
--- a/src/main/java/com/github/hronom/scrape/dat/website/controllers/ScrapeButtonController.java
+++ b/src/main/java/com/github/hronom/scrape/dat/website/controllers/ScrapeButtonController.java
@@ -6,6 +6,10 @@
import com.gargoylesoftware.htmlunit.WebRequest;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.github.hronom.scrape.dat.website.views.ScrapeView;
+import com.ui4j.api.browser.BrowserEngine;
+import com.ui4j.api.browser.BrowserFactory;
+import com.ui4j.api.browser.Page;
+import com.ui4j.api.browser.PageConfiguration;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.logging.log4j.LogManager;
@@ -15,10 +19,6 @@
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
-import com.ui4j.api.browser.BrowserEngine;
-import com.ui4j.api.browser.BrowserFactory;
-import com.ui4j.api.browser.Page;
-
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.net.URL;
@@ -29,9 +29,38 @@ public class ScrapeButtonController {
private final ScrapeView scrapeView;
+ private final WebClient webClient;
+ private final BrowserEngine browserEngine;
+
public ScrapeButtonController(ScrapeView scrapeViewArg) {
scrapeView = scrapeViewArg;
- scrapeView.addScrapeButtonActionListener(createScrapeButtonActionListener2());
+ scrapeView.addScrapeButtonActionListener(createScrapeButtonActionListener());
+
+ // Create HTMLUnit WebClient.
+ {
+ webClient = new WebClient(BrowserVersion.FIREFOX_38);
+ webClient.getOptions().setCssEnabled(true);
+ webClient.getOptions().setJavaScriptEnabled(true);
+ webClient.getOptions().setPopupBlockerEnabled(false);
+ webClient.getOptions().setRedirectEnabled(true);
+ webClient.getOptions().setActiveXNative(true);
+ webClient.getOptions().setAppletEnabled(true);
+ webClient.getOptions().setUseInsecureSSL(true);
+ webClient.getOptions().setThrowExceptionOnScriptError(false);
+ webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
+ webClient.getCookieManager().setCookiesEnabled(true);
+ webClient.setAjaxController(new AjaxController() {
+ @Override
+ public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
+ return true;
+ }
+ });
+ }
+
+ // Create Ui4j BrowserEngine.
+ {
+ browserEngine = BrowserFactory.getWebKit();
+ }
}
public ActionListener createScrapeButtonActionListener() {
@@ -40,203 +69,166 @@ public ActionListener createScrapeButtonActionListener() {
public void actionPerformed(ActionEvent event) {
Executors.newSingleThreadExecutor().submit(new Runnable() {
public void run() {
- // Disable fields in view.
- scrapeView.setWebsiteUrlTextFieldEnabled(false);
- scrapeView.setSelectorTextFieldEnabled(false);
- scrapeView.setScrapeButtonEnabled(false);
- scrapeView.setWorkInProgress(true);
- scrapeView.setOutput("");
-
- scrapeView.setProgressBarTaskText("initializing");
- logger.info("Start processing...");
- long beginTime = System.currentTimeMillis();
-
- // Output input parameters.
- if (!scrapeView.getWebsiteUrl().isEmpty() &&
- !scrapeView.getSelector().isEmpty()) {
- logger.info("Input parameters: \"" +
- scrapeView.getWebsiteUrl() + "\", \"" +
- scrapeView.getSelector() + "\", \"");
+ if (scrapeView.isUi4jEnabled()) {
+ processByUi4j();
+ } else {
+ processByHtmlUnit();
}
-
- // Process.
- try (WebClient webClient = createWebClient()) {
- URL url = new URL(scrapeView.getWebsiteUrl());
- scrapeView.setProgressBarTaskText("requesting page");
- logger.info("Requesting page...");
- HtmlPage page = webClient.getPage(url);
- logger.info("Requesting of page completed.");
-
- scrapeView.setProgressBarTaskText("viewing page as XML");
- logger.info("View page as XML");
- String xml = page.asXml();
-
- // Unescape html.
- scrapeView.setProgressBarTaskText("unescaping HTML");
- logger.info("Unescape html");
- xml = StringEscapeUtils.unescapeHtml4(xml);
-
- logger.info("Get selector");
- String selector = scrapeView.getSelector();
- if (!xml.isEmpty() && !selector.isEmpty()) {
- scrapeView.setProgressBarTaskText("parsing HTML");
- logger.info("Parse HTML");
- Document doc = Jsoup.parse(xml);
-
- scrapeView.setProgressBarTaskText("selecting elements in HTML");
- logger.info("select elements in HTML");
- Elements selectedElements = doc.select(selector);
-
- if (!selectedElements.isEmpty()) {
- scrapeView.setProgressBarTaskText("parsing selected elements");
- logger.info("Parse extracted elements");
- StringBuilder sb = new StringBuilder();
- for (Element element : selectedElements) {
- String body = element.html();
- sb.append(body);
- sb.append("\n");
- sb.append("\n");
- }
- scrapeView.setOutput(sb.toString());
- }
- }
- } catch (Exception e) {
- logger.error(e);
- }
-
- long endTime = System.currentTimeMillis();
- logger.info("Process time: " + (endTime - beginTime) + " ms.");
- logger.info("Processing complete.");
-
- // Enable fields in view.
- scrapeView.setWorkInProgress(false);
- scrapeView.setScrapeButtonEnabled(true);
- scrapeView.setSelectorTextFieldEnabled(true);
- scrapeView.setWebsiteUrlTextFieldEnabled(true);
}
});
}
};
}
- public ActionListener createScrapeButtonActionListener2() {
- return new ActionListener() {
- @Override
- public void actionPerformed(ActionEvent event) {
- Executors.newSingleThreadExecutor().submit(new Runnable() {
- public void run() {
- // Disable fields in view.
- scrapeView.setWebsiteUrlTextFieldEnabled(false);
- scrapeView.setSelectorTextFieldEnabled(false);
- scrapeView.setScrapeButtonEnabled(false);
- scrapeView.setWorkInProgress(true);
- scrapeView.setOutput("");
-
- scrapeView.setProgressBarTaskText("initializing");
- logger.info("Start processing...");
- long beginTime = System.currentTimeMillis();
-
- // Output input parameters.
- if (!scrapeView.getWebsiteUrl().isEmpty() &&
- !scrapeView.getSelector().isEmpty()) {
- logger.info("Input parameters: \"" +
- scrapeView.getWebsiteUrl() + "\", \"" +
- scrapeView.getSelector() + "\", \"");
- }
-
- // Process.
- BrowserEngine browserEngine = createBrowserEngine();
-
- // Navigate to blank page.
- scrapeView.setProgressBarTaskText("requesting page");
- logger.info("Requesting page...");
- Page page = browserEngine.navigate(scrapeView.getWebsiteUrl());
- logger.info("Requesting of page completed.");
-
- scrapeView.setProgressBarTaskText("viewing page as XML");
- logger.info("View page as XML");
- String html = page.getDocument().getBody().getInnerHTML();;
-
- // Unescape html.
- scrapeView.setProgressBarTaskText("unescaping HTML");
- logger.info("Unescape html");
- html = StringEscapeUtils.unescapeHtml4(html);
-
- logger.info("Get selector");
- String selector = scrapeView.getSelector();
- if (!html.isEmpty() && !selector.isEmpty()) {
- scrapeView.setProgressBarTaskText("parsing HTML");
- logger.info("Parse HTML");
- Document doc = Jsoup.parse(html);
-
- scrapeView.setProgressBarTaskText("selecting elements in HTML");
- logger.info("select elements in HTML");
- Elements selectedElements = doc.select(selector);
-
- if (!selectedElements.isEmpty()) {
- scrapeView.setProgressBarTaskText("parsing selected elements");
- logger.info("Parse extracted elements");
- StringBuilder sb = new StringBuilder();
- for (Element element : selectedElements) {
- String body = element.html();
- sb.append(body);
- sb.append("\n");
- sb.append("\n");
- }
- scrapeView.setOutput(sb.toString());
- }
- }
-
- long endTime = System.currentTimeMillis();
- logger.info("Process time: " + (endTime - beginTime) + " ms.");
- logger.info("Processing complete.");
-
- // Enable fields in view.
- scrapeView.setWorkInProgress(false);
- scrapeView.setScrapeButtonEnabled(true);
- scrapeView.setSelectorTextFieldEnabled(true);
- scrapeView.setWebsiteUrlTextFieldEnabled(true);
+ public void processByHtmlUnit() {
+ // Disable fields in view.
+ scrapeView.setWebsiteUrlTextFieldEnabled(false);
+ scrapeView.setSelectorTextFieldEnabled(false);
+ scrapeView.setScrapeButtonEnabled(false);
+ scrapeView.setWorkInProgress(true);
+ scrapeView.setOutput("");
+
+ scrapeView.setProgressBarTaskText("initializing");
+ logger.info("Start processing...");
+ long beginTime = System.currentTimeMillis();
+
+ // Output input parameters.
+ if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) {
+ logger.info("Input parameters: \"" +
+ scrapeView.getWebsiteUrl() + "\", \"" +
+ scrapeView.getSelector() + "\", \"");
+ }
+
+ // Process.
+ try {
+ URL url = new URL(scrapeView.getWebsiteUrl());
+ scrapeView.setProgressBarTaskText("requesting page");
+ logger.info("Requesting page...");
+ HtmlPage page = webClient.getPage(url);
+ logger.info("Requesting of page completed.");
+
+ scrapeView.setProgressBarTaskText("viewing page as XML");
+ logger.info("View page as XML");
+ String xml = page.asXml();
+
+ // Unescape html.
+ scrapeView.setProgressBarTaskText("unescaping HTML");
+ logger.info("Unescape html");
+ xml = StringEscapeUtils.unescapeHtml4(xml);
+
+ logger.info("Get selector");
+ String selector = scrapeView.getSelector();
+ if (!xml.isEmpty() && !selector.isEmpty()) {
+ scrapeView.setProgressBarTaskText("parsing HTML");
+ logger.info("Parse HTML");
+ Document doc = Jsoup.parse(xml);
+
+ scrapeView.setProgressBarTaskText("selecting elements in HTML");
+ logger.info("select elements in HTML");
+ Elements selectedElements = doc.select(selector);
+
+ if (!selectedElements.isEmpty()) {
+ scrapeView.setProgressBarTaskText("parsing selected elements");
+ logger.info("Parse extracted elements");
+ StringBuilder sb = new StringBuilder();
+ for (Element element : selectedElements) {
+ String body = element.html();
+ sb.append(body);
+ sb.append("\n");
+ sb.append("\n");
}
- });
+ scrapeView.setOutput(sb.toString());
+ }
}
- };
- }
+ } catch (Exception e) {
+ logger.error(e);
+ }
- private WebClient createWebClient() {
- WebClient webClient = new WebClient(BrowserVersion.FIREFOX_38);
- webClient.getOptions().setCssEnabled(true);
- webClient.getOptions().setJavaScriptEnabled(true);
- webClient.getOptions().setPopupBlockerEnabled(false);
- webClient.getOptions().setRedirectEnabled(true);
- webClient.getOptions().setActiveXNative(true);
- webClient.getOptions().setAppletEnabled(true);
- webClient.getOptions().setUseInsecureSSL(true);
- webClient.getOptions().setThrowExceptionOnScriptError(false);
- webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
- webClient.setAjaxController(new AjaxController() {
- @Override
- public boolean processSynchron(HtmlPage page, WebRequest request, boolean async) {
- return true;
- }
- });
- return webClient;
+ webClient.close();
+
+ long endTime = System.currentTimeMillis();
+ logger.info("Process time: " + (endTime - beginTime) + " ms.");
+ logger.info("Processing complete.");
+
+ // Enable fields in view.
+ scrapeView.setWorkInProgress(false);
+ scrapeView.setScrapeButtonEnabled(true);
+ scrapeView.setSelectorTextFieldEnabled(true);
+ scrapeView.setWebsiteUrlTextFieldEnabled(true);
}
- private BrowserEngine createBrowserEngine() {
- // Get the instance of the webkit.
- BrowserEngine browser = BrowserFactory.getWebKit();
+ public void processByUi4j() {
+ // Disable fields in view.
+ scrapeView.setWebsiteUrlTextFieldEnabled(false);
+ scrapeView.setSelectorTextFieldEnabled(false);
+ scrapeView.setScrapeButtonEnabled(false);
+ scrapeView.setWorkInProgress(true);
+ scrapeView.setOutput("");
+
+ scrapeView.setProgressBarTaskText("initializing");
+ logger.info("Start processing...");
+ long beginTime = System.currentTimeMillis();
+
+ // Output input parameters.
+ if (!scrapeView.getWebsiteUrl().isEmpty() && !scrapeView.getSelector().isEmpty()) {
+ logger.info("Input parameters: \"" +
+ scrapeView.getWebsiteUrl() + "\", \"" +
+ scrapeView.getSelector() + "\", \"");
+ }
// Navigate to blank page.
- // Page page = browser.navigate("about:blank");
-
- // Show the browser page.
+ scrapeView.setProgressBarTaskText("requesting page");
+ logger.info("Requesting page...");
+ PageConfiguration pageConfig = new PageConfiguration();
+ Page page = browserEngine.navigate(scrapeView.getWebsiteUrl(), pageConfig);
//page.show();
- //System.setProperty("ui4j.headless", "true");
+ logger.info("Requesting of page completed.");
+
+ scrapeView.setProgressBarTaskText("viewing page as XML");
+ logger.info("View page as XML");
+ String html = page.getDocument().getBody().getInnerHTML();
+ ;
+
+ // Unescape html.
+ scrapeView.setProgressBarTaskText("unescaping HTML");
+ logger.info("Unescape html");
+ html = StringEscapeUtils.unescapeHtml4(html);
+
+ logger.info("Get selector");
+ String selector = scrapeView.getSelector();
+ if (!html.isEmpty() && !selector.isEmpty()) {
+ scrapeView.setProgressBarTaskText("parsing HTML");
+ logger.info("Parse HTML");
+ Document doc = Jsoup.parse(html);
+
+ scrapeView.setProgressBarTaskText("selecting elements in HTML");
+ logger.info("select elements in HTML");
+ Elements selectedElements = doc.select(selector);
+
+ if (!selectedElements.isEmpty()) {
+ scrapeView.setProgressBarTaskText("parsing selected elements");
+ logger.info("Parse extracted elements");
+ StringBuilder sb = new StringBuilder();
+ for (Element element : selectedElements) {
+ String body = element.html();
+ sb.append(body);
+ sb.append("\n");
+ sb.append("\n");
+ }
+ scrapeView.setOutput(sb.toString());
+ }
+ }
+
+ browserEngine.clearCookies();
- //page.getDocument().getBody().getInnerHTML();
+ long endTime = System.currentTimeMillis();
+ logger.info("Process time: " + (endTime - beginTime) + " ms.");
+ logger.info("Processing complete.");
- // Append html header to the document body.
- //page.getDocument().getBody().append("
Hello, World!
");
- return browser;
+ // Enable fields in view.
+ scrapeView.setWorkInProgress(false);
+ scrapeView.setScrapeButtonEnabled(true);
+ scrapeView.setSelectorTextFieldEnabled(true);
+ scrapeView.setWebsiteUrlTextFieldEnabled(true);
}
}
\ No newline at end of file
diff --git a/src/main/java/com/github/hronom/scrape/dat/website/views/ScrapeView.java b/src/main/java/com/github/hronom/scrape/dat/website/views/ScrapeView.java
index 3016feb..4a67d17 100644
--- a/src/main/java/com/github/hronom/scrape/dat/website/views/ScrapeView.java
+++ b/src/main/java/com/github/hronom/scrape/dat/website/views/ScrapeView.java
@@ -12,6 +12,7 @@ public class ScrapeView extends JPanel {
private final JTextField websiteUrlTextField;
private final JLabel selectorLabel;
private final JTextField selectorTextField;
+ private final JCheckBox ui4jCheckBox;
private final JButton scrapeButton;
private final JTextArea outputTextArea;
private final JProgressBar progressBar;
@@ -56,7 +57,7 @@ public ScrapeView() {
}
{
- selectorLabel = new JLabel("Selector:");
+ selectorLabel = new JLabel("CSS Selector:");
constraint.weightx = 0;
constraint.weighty = 0;
@@ -82,7 +83,8 @@ public ScrapeView() {
}
{
- scrapeButton = new JButton("Scrape website");
+ ui4jCheckBox = new JCheckBox("Use Ui4j for headless browser");
+ ui4jCheckBox.setSelected(false);
constraint.weightx = 1;
constraint.weighty = 0;
@@ -91,6 +93,19 @@ public ScrapeView() {
constraint.gridwidth = 2;
constraint.gridheight = 1;
constraint.fill = GridBagConstraints.BOTH;
+ this.add(ui4jCheckBox, constraint);
+ }
+
+ {
+ scrapeButton = new JButton("Scrape website");
+
+ constraint.weightx = 1;
+ constraint.weighty = 0;
+ constraint.gridx = 0;
+ constraint.gridy = 3;
+ constraint.gridwidth = 2;
+ constraint.gridheight = 1;
+ constraint.fill = GridBagConstraints.BOTH;
this.add(scrapeButton, constraint);
}
@@ -108,7 +123,7 @@ public ScrapeView() {
constraint.weightx = 1;
constraint.weighty = 1;
constraint.gridx = 0;
- constraint.gridy = 3;
+ constraint.gridy = 4;
constraint.gridwidth = 2;
constraint.gridheight = 1;
constraint.fill = GridBagConstraints.BOTH;
@@ -125,7 +140,7 @@ public ScrapeView() {
constraint.weightx = 1;
constraint.weighty = 0;
constraint.gridx = 0;
- constraint.gridy = 4;
+ constraint.gridy = 5;
constraint.gridwidth = 2;
constraint.gridheight = 1;
constraint.fill = GridBagConstraints.BOTH;
@@ -133,6 +148,10 @@ public ScrapeView() {
}
}
+ public boolean isUi4jEnabled() {
+ return ui4jCheckBox.isSelected();
+ }
+
public void addScrapeButtonActionListener(ActionListener actionListener) {
scrapeButton.addActionListener(actionListener);
}
@@ -161,7 +180,7 @@ public void setOutput(String text) {
outputTextArea.setText(text);
}
- public void setWorkInProgress(boolean working){
+ public void setWorkInProgress(boolean working) {
progressBar.setVisible(working);
}