From 4e2036d3b9e3018b775d8ba99a56ce5321456143 Mon Sep 17 00:00:00 2001
From: ACA
Date: Sun, 3 Mar 2024 12:59:05 +0100
Subject: [PATCH] cleaner: keep HTML table structure more intact

---
 lncrawl/core/cleaner.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/lncrawl/core/cleaner.py b/lncrawl/core/cleaner.py
index b0c8e131c..eebf1df87 100644
--- a/lncrawl/core/cleaner.py
+++ b/lncrawl/core/cleaner.py
@@ -138,6 +138,13 @@ def __init__(self) -> None:
                 # the attributes to keep while cleaning a tag
                 "src",
                 "style",
+                # table and table children attributes
+                "colspan",
+                "rowspan",
+                "headers",
+                "scope",
+                "axis",
+                "id",  # id required for headers ref
             ]
         )
         self.whitelist_css_property: Set[str] = set(