Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introducing Accepting Cookie Consent lite #483

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 180 additions & 5 deletions apps/workers/crawlerWorker.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import assert from "assert";
import * as dns from "dns";
import * as path from "node:path";
import type { Browser } from "puppeteer";
import { Readability } from "@mozilla/readability";
import { Mutex } from "async-mutex";
import DOMPurify from "dompurify";
Expand All @@ -18,6 +17,7 @@ import metascraperReadability from "metascraper-readability";
import metascraperTitle from "metascraper-title";
import metascraperTwitter from "metascraper-twitter";
import metascraperUrl from "metascraper-url";
import { Browser, Frame, Page } from "puppeteer";
import puppeteer from "puppeteer-extra";
import AdblockerPlugin from "puppeteer-extra-plugin-adblocker";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
Expand Down Expand Up @@ -264,17 +264,23 @@ async function crawlPage(jobId: string, url: string) {

await page.goto(url, {
timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
waitUntil: "networkidle0",
});

logger.info(
`[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
);

// Wait until there's at most two connections for 2 seconds
// Attempt to wait only for 5 seconds
logger.info(`[Crawler][${jobId}] Clicking Cookie Consent Banner.`);
await acceptCookies(page, jobId);

logger.info(`[Crawler][${jobId}] Hiding Consent Banner if Still Visible.`);
await hideConsentBanner(page, jobId);

await Promise.race([
page.waitForNetworkIdle({
idleTime: 1000, // 1 sec
concurrency: 2,
idleTime: 2000, // Wait for 2 seconds of no significant network activity
concurrency: 0, // No active network connections
}),
new Promise((f) => setTimeout(f, 5000)),
]);
Expand Down Expand Up @@ -303,6 +309,175 @@ async function crawlPage(jobId: string, url: string) {
}
}

/**
 * Best-effort attempt to hide cookie/consent banners by injecting a CSS rule
 * (via `applyHideConsentBanner`) into the main document and into every other
 * frame of the page.
 *
 * Hiding banners is cosmetic, so no failure is allowed to propagate: errors
 * for the main document and for individual frames are logged as warnings and
 * the remaining frames are still processed.
 *
 * @param page  The page whose banners should be hidden.
 * @param jobId Crawl job identifier, used only as a log prefix.
 */
async function hideConsentBanner(page: Page, jobId: string): Promise<void> {
  // Hide banners in the main document. Guarded like the per-frame calls below
  // so that a failure here cannot abort the surrounding crawl.
  try {
    await applyHideConsentBanner(page);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.warn(
      `[Crawler][${jobId}] Unable to hide consent banner in main document "${page.url()}": ${reason}`,
    );
  }

  // Hide banners in all other frames; the main frame was handled above.
  for (const frame of page.frames()) {
    if (frame === page.mainFrame()) {
      continue;
    }
    try {
      await applyHideConsentBanner(frame);
    } catch (error) {
      if (error instanceof Error) {
        logger.warn(
          `[Crawler][${jobId}] Unable to hide consent banner in frame "${frame.url()}": ${error.message}`,
        );
      } else {
        logger.warn(
          `[Crawler][${jobId}] Unknown error occurred while hiding consent banner in frame "${frame.url()}": ${String(error)}`,
        );
      }
    }
  }
}

/**
 * Injects a `<style>` element into the document of `frame` that forces
 * `display: none !important` on a fixed list of class and id selectors
 * commonly used for cookie/consent banners.
 *
 * The style is appended to `document.head` when it exists, otherwise to the
 * document's root element; if neither exists nothing is injected.
 *
 * @param frame The page or frame whose document receives the style rule.
 */
async function applyHideConsentBanner(frame: Frame | Page) {
  await frame.evaluate(() => {
    const style = document.createElement("style");
    style.textContent = `
.cookie-banner, .consent-banner, .gdpr-banner, .cookie-consent, .cookie-notice,
.cookie-popup, .cookie-policy-banner, .cookie-warning, .cookie-bar, .cookie-message,
.cookie-container, .cookie-acceptance, .cookie-disclaimer, .cookie-info, .cookie-overlay, .cmp-banner,
#cookieBanner, #cookieConsent, #cookieNotice, #cookiePolicy, #gdprBanner, #consentPopup, #privacyBanner {
display: none !important;
}
`;
    // document.head is null for documents without a <head> element (e.g.
    // XML/SVG documents); fall back to the root element instead of throwing.
    const container = document.head ?? document.documentElement;
    container?.appendChild(style);
  });
}

/**
 * Tries to dismiss a cookie consent dialog by clicking a button whose label
 * matches one of a fixed list of keywords.
 *
 * The main frame is tried first; if nothing was clicked there, every other
 * frame of the page is tried in turn. The function returns as soon as one
 * click succeeds. Errors are logged and never rethrown.
 *
 * @param page  The page to search for a consent button.
 * @param jobId Crawl job identifier, used only as a log prefix.
 */
async function acceptCookies(page: Page, jobId: string) {
  const consentLabels = [
    // English
    "accept",
    "agree",
    "got it",
    "consent",
    "allow",
    "enable",
    "continue",
    "accept all",
    "agree and proceed",
    "confirm",
    "accept cookies",
    "accept and close",
    "I accept",
    "I agree",
    // German
    "akzeptieren",
    "zustimmen",
    "alle akzeptieren",
    "alle zustimmen",
    "erlauben",
    "weiter",
    "zustimmen und fortfahren",
    "zustimmen und weiter",
    "bestätigen",
    "cookies akzeptieren",
    "ich akzeptiere",
    "ich stimme zu",
    "akzeptieren und schließen",
    "akzeptieren und weiter",
    // French
    "accepter",
    "continuer",
    "autoriser",
    // Spanish
    "aceptar",
    "continuar",
    "permitir",
    // Italian
    "accetta",
    "consenti",
    "continua",
  ];

  try {
    // First pass: the main frame.
    if (await clickConsentButton(page, consentLabels, jobId)) {
      return;
    }

    // Second pass: every frame other than the main one.
    for (const childFrame of page.frames()) {
      if (childFrame === page.mainFrame()) {
        continue;
      }
      try {
        if (await clickConsentButton(childFrame, consentLabels, jobId)) {
          return;
        }
      } catch (frameError) {
        // A failure in one frame must not stop the search in the others.
        const warning =
          frameError instanceof Error
            ? `[Crawler][${jobId}] Unable to access frame "${childFrame.url()}": ${frameError.message}`
            : `[Crawler][${jobId}] Unknown error occurred while accessing frame "${childFrame.url()}": ${String(frameError)}`;
        logger.warn(warning);
      }
    }

    logger.warn(`[Crawler][${jobId}] No matching cookie consent button found.`);
  } catch (failure) {
    const report =
      failure instanceof Error
        ? `[Crawler][${jobId}] Error occurred while attempting to accept cookies: ${failure.message}`
        : `[Crawler][${jobId}] Unknown error occurred while attempting to accept cookies: ${String(failure)}`;
    logger.error(report);
  }

  // Only reached when no click succeeded (the successful paths return early).
  logger.info(`[Crawler][${jobId}] Finished acceptCookies function.`);
}

/**
 * Looks for a `button`, `a` or `span` element in `frame` whose trimmed,
 * lower-cased text content equals one of `cookieKeywords`, and clicks the
 * first such element that intersects the viewport.
 *
 * Keywords are compared case-insensitively: both the element text and the
 * keywords are lower-cased and trimmed before comparison, so entries such as
 * "I accept" match a button labelled "I Accept".
 *
 * @param frame          The page or frame to search.
 * @param cookieKeywords Button labels to match against (whole-text match).
 * @param jobId          Crawl job identifier, used only as a log prefix.
 * @returns `true` if an element was clicked, `false` otherwise. Errors while
 *          checking visibility or clicking an element are logged and the
 *          search continues with the next element.
 */
async function clickConsentButton(
  frame: Frame | Page,
  cookieKeywords: string[],
  jobId: string,
): Promise<boolean> {
  // Element text is lower-cased below, so normalise the keywords the same
  // way. The original spelling is kept for logging; the first occurrence wins
  // to mirror the order-sensitive lookup of a plain `find`.
  const keywordByNormalizedText = new Map<string, string>();
  for (const keyword of cookieKeywords) {
    const normalized = keyword.toLowerCase().trim();
    // Skip empty keywords so that elements without text are never matched.
    if (normalized !== "" && !keywordByNormalizedText.has(normalized)) {
      keywordByNormalizedText.set(normalized, keyword);
    }
  }

  const elements = await frame.$$("button, a, span");
  for (const element of elements) {
    const text = await element.evaluate(
      (el) => el.textContent?.toLowerCase().trim() ?? "",
    );
    const matchedKeyword = keywordByNormalizedText.get(text);
    if (matchedKeyword === undefined) {
      continue;
    }
    try {
      const isVisible = await element.isIntersectingViewport({
        threshold: 0.5,
      });
      if (!isVisible) {
        continue; // Skip if element is not visible
      }
      await frame.evaluate((el) => {
        el.scrollIntoView({ behavior: "smooth", block: "center" });
        el.click();
      }, element);
      logger.info(
        `[Crawler][${jobId}] Clicked cookie consent button with text: "${text}" using keyword: "${matchedKeyword}".`,
      );
      // Pause for one second after the click before reporting success.
      await new Promise((resolve) => setTimeout(resolve, 1000));
      return true;
    } catch (error) {
      if (error instanceof Error) {
        logger.error(
          `[Crawler][${jobId}] Failed to click the consent button with text: "${text}". Error: ${error.message}`,
        );
      } else {
        logger.error(
          `[Crawler][${jobId}] Unknown error occurred while clicking the consent button with text: "${text}". Error: ${String(error)}`,
        );
      }
    }
  }
  return false;
}

async function extractMetadata(
htmlContent: string,
url: string,
Expand Down