diff --git a/pom.xml b/pom.xml index d78692e..e420326 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 info.llort torrent-web-parser - 1.3 + 1.4 Torrent web parser diff --git a/src/main/java/info/llort/torrent/Config.java b/src/main/java/info/llort/torrent/Config.java index d3a88cf..98c3d67 100644 --- a/src/main/java/info/llort/torrent/Config.java +++ b/src/main/java/info/llort/torrent/Config.java @@ -1,7 +1,7 @@ package info.llort.torrent; public abstract class Config { - public static final String URL_WEB_TO_PARSE = "https://atomixhq.top"; + public static final String URL_WEB_TO_PARSE = "https://atomixhq.art"; public static final String FIREFOX_DRIVER_PATH = "/home/jllort/git/torrentwebparser/geckodriver"; public static final String FILESYSTEM_DOWNLOAD_PATH = "/home/jllort/Descargas"; public static long FILE_DOWNLOAD_TIMEOUT = 2; diff --git a/src/main/java/info/llort/torrent/Main.java b/src/main/java/info/llort/torrent/Main.java index 9055b72..3da71f3 100644 --- a/src/main/java/info/llort/torrent/Main.java +++ b/src/main/java/info/llort/torrent/Main.java @@ -1,7 +1,7 @@ package info.llort.torrent; import info.llort.torrent.util.Console; -import info.llort.torrent.util.PctmixWebParserV2; +import info.llort.torrent.util.PctmixWebParserV3; import net.lightbody.bmp.BrowserMobProxy; import net.lightbody.bmp.BrowserMobProxyServer; import net.lightbody.bmp.client.ClientUtil; @@ -108,7 +108,7 @@ public static void main(String[] args) { WebDriver driver = new FirefoxDriver(firefoxOptions); - PctmixWebParserV2.capture(urlWebToParse, geckoDriverPath, filters, downloadTimeOut, driver, proxy); + PctmixWebParserV3.capture(urlWebToParse, geckoDriverPath, filters, downloadTimeOut, driver, proxy); // closing the driver driver.close(); diff --git a/src/main/java/info/llort/torrent/util/PctmixWebParserV3.java b/src/main/java/info/llort/torrent/util/PctmixWebParserV3.java new file mode 100644 index 0000000..61ec1f2 --- /dev/null +++ 
b/src/main/java/info/llort/torrent/util/PctmixWebParserV3.java
@@ -0,0 +1,236 @@
+package info.llort.torrent.util;
+
+import com.google.common.net.HttpHeaders;
+import info.llort.torrent.bean.PageLinkInfo;
+import net.lightbody.bmp.BrowserMobProxy;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.select.Elements;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.TimeoutException;
+import org.openqa.selenium.WebDriver;
+
+import java.io.IOException;
+import java.time.Duration;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import static org.fusesource.jansi.Ansi.Color.*;
+
+/**
+ * Walks the site with Selenium in three hops (main page, torrent page, download page)
+ * and then navigates to every resolved torrent file URL.
+ */
+public class PctmixWebParserV3 {
+    /** Endpoint that receives the torrent id; its response becomes the torrent file name. */
+    private static final String TORRENT_ID_ENDPOINT = "https://atomtt.com/to.php";
+    /** Prefix of the final torrent file URL. */
+    private static final String TORRENT_DOWNLOAD_BASE_URL = "https://atomixhq.art/t_download/";
+    /** Captures the quoted argument of each {@code window.open("...")} call. */
+    private static final Pattern WINDOW_OPEN_PATTERN = Pattern.compile("window\\.open\\(\"(.*?)\"\\)");
+    /** Digits only: the captured value is spliced into a JavaScript string literal. */
+    private static final Pattern TORRENT_ID_PATTERN = Pattern.compile("parseInt\\(\"(\\d+)\"\\)");
+
+    /**
+     * Runs the whole pipeline, logging the links found at each step and requesting
+     * every resulting torrent file.
+     *
+     * @param urlWebToParse   page scanned for candidate links
+     * @param geckoDriverPath forwarded to {@link #findMainPageLinks}, which does not read it
+     * @param filters         substrings matched case-insensitively against each link
+     * @param downloadTimeOut driver timeout, in seconds, used for each download
+     * @param driver          browser that loads every page
+     * @param proxy           proxy used to set the Referer header of each request
+     */
+    public static void capture(String urlWebToParse, String geckoDriverPath, List<String> filters, long downloadTimeOut, WebDriver driver, BrowserMobProxy proxy) throws IOException, InterruptedException {
+        Set<PageLinkInfo> mainPageLinks = findMainPageLinks(urlWebToParse, geckoDriverPath, filters, driver);
+        for (PageLinkInfo pli : mainPageLinks) {
+            Console.println("Main page link: " + pli.getUrl(), WHITE);
+        }
+
+        Set<PageLinkInfo> torrentPageLinks = findPageTorrentLinks(mainPageLinks, driver, proxy);
+        for (PageLinkInfo pli : torrentPageLinks) {
+            Console.println("Torrent page link: " + pli.getUrl(), WHITE);
+        }
+
+        Set<PageLinkInfo> downloadTorrentLinks = downloadTorrentLinks(torrentPageLinks, driver, proxy);
+        for (PageLinkInfo pli : downloadTorrentLinks) {
+            Console.println("Download link: " + pli.getUrl(), WHITE);
+            downloadTorrentFile(pli, driver, downloadTimeOut, proxy);
+        }
+    }
+
+    /**
+     * Loads {@code url} and returns the absolute https links that belong to a wanted
+     * category and contain at least one filter. Each result keeps the original link as
+     * referer and points at the same path with {@code /descargar/torrent/} in place of
+     * {@code /descargar/}.
+     *
+     * @param url             page to load
+     * @param geckoDriverPath unused; kept so existing callers keep compiling
+     * @param filters         substrings matched case-insensitively against each link
+     * @param driver          browser that loads the page
+     * @return the matching links, possibly empty
+     */
+    public static Set<PageLinkInfo> findMainPageLinks(String url, String geckoDriverPath, List<String> filters, WebDriver driver) throws IOException {
+        // Inspired by https://www.javatpoint.com/selenium-webdriver-running-test-on-firefox-browser-gecko-driver
+        driver.get(url);
+
+        Set<PageLinkInfo> links = new HashSet<>();
+        Document doc = Jsoup.parse(driver.getPageSource());
+        Elements elements = doc.select("a[href]");
+        for (Element element : elements) {
+            String href = element.attr("href");
+            if (!href.startsWith("https:") || !isWantedCategory(href) || !matchesAnyFilter(href, filters)) {
+                continue;
+            }
+            // /descargar/ must be replaced by /descargar/torrent/
+            PageLinkInfo pli = new PageLinkInfo();
+            pli.setReferer(href);
+            pli.setUrl(href.replace("/descargar/", "/descargar/torrent/"));
+            links.add(pli);
+        }
+
+        return links;
+    }
+
+    /** Tells whether the link belongs to one of the movie or series categories of interest. */
+    private static boolean isWantedCategory(String href) {
+        return (href.contains("/descargar/peliculas-castellano/") && href.contains("blurayrip"))
+                || (href.contains("/descargar/peliculas-x264-mkv/") && href.contains("bluray"))
+                || (href.contains("/descargar/cine-alta-definicion-hd/") && href.contains("bluray"))
+                || href.contains("/descargar/serie/")
+                || href.contains("/descargar/serie-en-hd/")
+                || href.contains("/descargar/serie-4k/");
+    }
+
+    /** Tells whether the link contains any of the filters, ignoring case. */
+    private static boolean matchesAnyFilter(String href, List<String> filters) {
+        String lowerHref = href.toLowerCase(Locale.ROOT);
+        for (String filter : filters) {
+            if (lowerHref.contains(filter.toLowerCase(Locale.ROOT))) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Visits every page link and collects the URLs passed to {@code window.open("...")}
+     * in its source. The visited page becomes the referer of each collected link.
+     *
+     * @param pageLinks links to visit, each sent with its own referer
+     * @param driver    browser that loads the pages
+     * @param proxy     proxy used to set the Referer header
+     * @return the collected links, possibly empty
+     */
+    public static Set<PageLinkInfo> findPageTorrentLinks(Set<PageLinkInfo> pageLinks, WebDriver driver, BrowserMobProxy proxy) throws IOException {
+        Set<PageLinkInfo> links = new HashSet<>();
+        for (PageLinkInfo pli : pageLinks) {
+            // Setting referer before jump to the page
+            proxy.addHeader(HttpHeaders.REFERER, pli.getReferer());
+            driver.get(pli.getUrl());
+
+            Matcher matcher = WINDOW_OPEN_PATTERN.matcher(driver.getPageSource());
+            while (matcher.find()) {
+                String tLink = matcher.group(1);
+                // Links are normally scheme-less; only those need the https prefix
+                if (!tLink.startsWith("http://") && !tLink.startsWith("https://")) {
+                    tLink = "https:" + tLink;
+                }
+                PageLinkInfo newPli = new PageLinkInfo();
+                newPli.setUrl(tLink);
+                newPli.setReferer(pli.getUrl());
+                links.add(newPli);
+            }
+        }
+        return links;
+    }
+
+    /**
+     * Visits every torrent page, reads the numeric id found in {@code parseInt("...")},
+     * posts it to the id endpoint from inside the browser and builds the torrent file
+     * URL from the response. Pages without an id or with an empty response are logged
+     * and skipped.
+     *
+     * @param pageLinks torrent pages to visit, each sent with its own referer
+     * @param driver    browser; must also be a {@link JavascriptExecutor} to produce links
+     * @param proxy     proxy used to set the Referer header
+     * @return the torrent file links, possibly empty
+     */
+    public static Set<PageLinkInfo> downloadTorrentLinks(Set<PageLinkInfo> pageLinks, WebDriver driver, BrowserMobProxy proxy) throws IOException {
+        Set<PageLinkInfo> links = new HashSet<>();
+        for (PageLinkInfo pli : pageLinks) {
+            // Setting referer before jump to the page
+            proxy.addHeader(HttpHeaders.REFERER, pli.getReferer());
+            driver.get(pli.getUrl());
+
+            Matcher matcher = TORRENT_ID_PATTERN.matcher(driver.getPageSource());
+            if (!matcher.find()) {
+                Console.println("intValue NOT found for page: " + pli.getUrl(), RED);
+                continue;
+            }
+            String intValue = matcher.group(1);
+            Console.println("intValue found: " + intValue, WHITE);
+            if (!(driver instanceof JavascriptExecutor)) {
+                continue;
+            }
+
+            // Referer must be atomtt
+            proxy.addHeader(HttpHeaders.REFERER, pli.getUrl());
+            // Synchronous request executed by the browser so the response can be returned
+            String js = "var xhr = new XMLHttpRequest();\n"
+                    + "xhr.open('POST', '" + TORRENT_ID_ENDPOINT + "', false);\n"
+                    + "xhr.setRequestHeader('Content-type', 'application/x-www-form-urlencoded');\n"
+                    + "xhr.send('t=" + intValue + "');\n"
+                    + "return xhr.response;\n";
+            Console.println("executing javascript: " + js, YELLOW);
+            Object result = ((JavascriptExecutor) driver).executeScript(js);
+            Console.println("Javascript result: " + result, GREEN);
+            String fileName = result == null ? "" : result.toString().trim();
+            if (fileName.isEmpty()) {
+                Console.println("Empty response for id " + intValue + " on page: " + pli.getUrl(), RED);
+                continue;
+            }
+
+            String torrentFileLinkValue = TORRENT_DOWNLOAD_BASE_URL + fileName + ".torrent";
+            Console.println("torrentFileLinkValue: " + torrentFileLinkValue, GREEN);
+            PageLinkInfo newPli = new PageLinkInfo();
+            newPli.setUrl(torrentFileLinkValue);
+            newPli.setReferer(pli.getUrl());
+            links.add(newPli);
+        }
+        return links;
+    }
+
+    /**
+     * Navigates to the torrent file URL with the link's referer set. Driver timeouts
+     * are shortened first because, per the original note, the driver otherwise locks.
+     *
+     * @param pli     link to download, with the referer to send
+     * @param driver  browser that performs the navigation
+     * @param timeOut script, page-load and implicit-wait timeout in seconds
+     * @param proxy   proxy used to set the Referer header
+     */
+    public static void downloadTorrentFile(PageLinkInfo pli, WebDriver driver, long timeOut, BrowserMobProxy proxy) throws InterruptedException {
+        try {
+            // Referer must be atomtt
+            proxy.addHeader(HttpHeaders.REFERER, pli.getReferer());
+            // Set timeout otherwise the driver lock
+            driver.manage().timeouts().scriptTimeout(Duration.ofSeconds(timeOut));
+            driver.manage().timeouts().pageLoadTimeout(Duration.ofSeconds(timeOut));
+            driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(timeOut));
+            driver.navigate().to(pli.getUrl());
+        } catch (TimeoutException ignored) {
+            // Presumably expected: a file download never finishes "loading" a page
+        } catch (RuntimeException e) {
+            // Best effort: report the failure and let the caller continue with the next link
+            Console.println("Download failed for " + pli.getUrl() + ": " + e.getMessage(), RED);
+        }
+    }
+}
diff --git a/src/test/java/info/jllort/torrent/ParserCheckIssue27.java b/src/test/java/info/jllort/torrent/ParserCheckIssue27.java
new file mode 100644
index 0000000..b9f95b1
--- /dev/null
+++ b/src/test/java/info/jllort/torrent/ParserCheckIssue27.java
@@ -0,0 +1,113 @@
+package info.jllort.torrent;
+
+import com.google.common.net.HttpHeaders;
+import info.llort.torrent.Config;
+import net.lightbody.bmp.BrowserMobProxy;
+import net.lightbody.bmp.BrowserMobProxyServer;
+import net.lightbody.bmp.client.ClientUtil;
+import net.lightbody.bmp.proxy.CaptureType;
+import org.openqa.selenium.JavascriptExecutor;
+import org.openqa.selenium.Proxy;
+import org.openqa.selenium.WebDriver;
+import org.openqa.selenium.firefox.FirefoxDriver;
+import org.openqa.selenium.firefox.FirefoxOptions;
+import org.openqa.selenium.firefox.FirefoxProfile;
+import org.openqa.selenium.remote.CapabilityType;
+
+import java.net.Inet4Address;
+import java.net.UnknownHostException;
+import java.time.Duration;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Manual check named after issue 27: opens one fixed torrent page in Firefox through a
+ * BrowserMob proxy, resolves the torrent file name via JavaScript and navigates to it.
+ * It needs the gecko driver path and download folder configured in {@link Config}.
+ */
+public class ParserCheckIssue27 {
+    public static void main(String[] args) throws UnknownHostException, InterruptedException {
+        String geckoDriverPath = Config.FIREFOX_DRIVER_PATH;
+        String dstPath = Config.FILESYSTEM_DOWNLOAD_PATH;
+        long timeOut = Config.FILE_DOWNLOAD_TIMEOUT;
+
+        String referer = "https://atomixhq.art/descargar/torrent/peliculas-castellano/un-segundo-one-second--2022-/blurayrip-ac3-5-1/";
+        String link = "https://atomtt.com/t_download/167641/un-segundo--one-second---2022-/";
+
+        // Creating proxy
+        BrowserMobProxy proxy = new BrowserMobProxyServer();
+        proxy.start(8080);
+        Proxy seleniumProxy = ClientUtil.createSeleniumProxy(proxy);
+
+        String hostIp = Inet4Address.getLocalHost().getHostAddress();
+        seleniumProxy.setHttpProxy(hostIp + ":" + proxy.getPort());
+        seleniumProxy.setSslProxy(hostIp + ":" + proxy.getPort());
+        proxy.enableHarCaptureTypes(CaptureType.REQUEST_CONTENT, CaptureType.RESPONSE_CONTENT);
+
+//        RequestFilter requestFilter = new RequestFilter() {
+//            @Override
+//            public HttpResponse filterRequest(HttpRequest httpRequest, HttpMessageContents httpMessageContents, HttpMessageInfo httpMessageInfo) {
+//                return null;
+//            }
+//        };
+//
+//        // put our custom header to each request
+//        proxy.addRequestFilter((request, contents, messageInfo)-> {
+//            request.headers().add(HttpHeaders.REFERER, referer);
+//            System.out.println(request.headers().entries().toString());
+//            return null;
+//        });
+
+        proxy.addHeader(HttpHeaders.REFERER, referer);
+
+        // Creating the driver
+        System.setProperty("webdriver.gecko.driver", geckoDriverPath);
+        // firefox profile to autosave
+        FirefoxOptions firefoxOptions = new FirefoxOptions();
+        FirefoxProfile fxProfile = new FirefoxProfile();
+        fxProfile.setPreference("browser.download.folderList", 2);
+        fxProfile.setPreference("browser.download.dir", dstPath);
+        fxProfile.setPreference("browser.helperApps.neverAsk.saveToDisk","application/octet-stream");
+        fxProfile.setPreference("pdfjs.enabledCache.state",false);
+        firefoxOptions.setProfile(fxProfile);
+        // Setting the proxy
+        firefoxOptions.setCapability(CapabilityType.PROXY, seleniumProxy);
+        firefoxOptions.setCapability(CapabilityType.ACCEPT_SSL_CERTS, true);
+
+        WebDriver driver = new FirefoxDriver(firefoxOptions);
+
+        // Page with link
+        driver.get(link);
+        String bodySource = driver.getPageSource();
+        String linkRegex = "(parseInt\\(\"(.*)\"\\);)";
+        Pattern pattern = Pattern.compile(linkRegex);
+        Matcher matcher = pattern.matcher(bodySource);
+        if (matcher.find()) {
+            String intValue = matcher.group(2);
+            System.out.println("intValue found: " + intValue);
+            if (driver instanceof JavascriptExecutor) {
+                // Referer must be atomtt
+                proxy.addHeader(HttpHeaders.REFERER, "https://atomtt.com/t_download/167632/sharkwater--the-requin---2022-/");
+                // Javascript request to be executed by selenium
+                String js = "var values = {'t':'167632'};\n"; // the variable
+                js += "var xhr = new XMLHttpRequest();\n";
+                js += "xhr.open('POST', 'https://atomtt.com/to.php', false);\n";
+                js += "xhr.setRequestHeader('Content-type', 'application/x-www-form-urlencoded');\n";
+                js += "xhr.send('t=" + intValue + "');\n";
+                js += "return xhr.response;\n";
+                System.out.println(js);
+                Object result = ((JavascriptExecutor) driver).executeScript(js);
+                System.out.println("Result value of cal to top: " + result);
+//                result = ((JavascriptExecutor) driver).executeScript("alert('Welcome to Guru99');");
+                String torrentFileLinkValue = "https://atomixhq.art/t_download/" + result + ".torrent";
+                System.out.println(torrentFileLinkValue);
+                driver.manage().timeouts().scriptTimeout(Duration.ofSeconds(timeOut));
+                driver.manage().timeouts().pageLoadTimeout(Duration.ofSeconds(timeOut));
+                driver.manage().timeouts().implicitlyWait(Duration.ofSeconds(timeOut));
+                // Other referer
+                proxy.addHeader(HttpHeaders.REFERER, link); // Referer is the page what contains the torrent link //atomtt etc...
+                driver.navigate().to(torrentFileLinkValue);
+            }
+        }
+    }
+}
diff --git a/src/test/java/info/jllort/torrent/RegexTest.java b/src/test/java/info/jllort/torrent/RegexTest.java
new file mode 100644
index 0000000..1b08e1a
--- /dev/null
+++ b/src/test/java/info/jllort/torrent/RegexTest.java
@@ -0,0 +1,39 @@
+package info.jllort.torrent;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Scratch check: extracts the quoted linkonclick URL from a sample of the page script
+ * and prints it without the surrounding quotes.
+ */
+public class RegexTest {
+    private static final String bodySample = "function openTorrent(u)\n" +
+            "{\n" +
+            "\n" +
+            "\t \n" +
+            "\n" +
+            "\tvar link = \"https://www.linkonclick.com/jump/next.php?r=5302219\";\n" +
+            "\n" +
+            "\twindow.open(link);\n" +
+            "\n" +
+            "\t\n" +
+            "\twindow.location.href = u;\t\n" +
+            "\n" +
+            "}\t\n" +
+            "\n" +
+            "\n" +
+            "var tid = parseInt(\"167641\");\n" +
+            "\n" +
+            "var btn = document.getElementById(\"btntor\");\n" +
+            "btn.addEventListener(\"click\", function() ";
+
+    public static void main(String[] args) {
+        String linkRegex = "([\"]https://www.linkonclick.com/jump/next.*[\"])";
+        Pattern pattern = Pattern.compile(linkRegex);
+        Matcher matcher = pattern.matcher(bodySample);
+        if (matcher.find()) {
+            System.out.println(matcher.group().replaceAll("\"",""));
+        }
+    }
+}
diff --git a/src/test/java/info/jllort/torrent/RegexTest2.java b/src/test/java/info/jllort/torrent/RegexTest2.java
new file mode 100644
index 0000000..fc55571
--- /dev/null
+++ b/src/test/java/info/jllort/torrent/RegexTest2.java
@@ -0,0 +1,29 @@
+package info.jllort.torrent;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Scratch check for the torrent id pattern used by the parser: prints the digits found
+ * inside {@code parseInt("...")} in a sample of the page script.
+ */
+public class RegexTest2 {
+    private static final String bodySample = "function openTorrent(u)\n" +
+            "}\t\n" +
+            "\n" +
+            "\n" +
+            "var tid = parseInt(\"167632\");\n" +
+            "\n" +
+            "var btn = document.getElementById(\"btntor\");\n" +
+            "btn.addEventListener(\"click\", function() ";
+
+    public static void main(String[] args) {
+        String linkRegex = "parseInt\\(\"(\\d+)\"\\)";
+        Pattern pattern = Pattern.compile(linkRegex);
+        Matcher matcher = pattern.matcher(bodySample);
+        if (matcher.find()) {
+            String value = matcher.group(1);
+            System.out.println(value);
+        }
+    }
+}