Commit
Merge pull request #21 from D4Vinci/dev
v0.2.8
D4Vinci authored Nov 30, 2024
2 parents d6cc07d + c481a1c commit 012820c
Showing 32 changed files with 186 additions and 137 deletions.
2 changes: 2 additions & 0 deletions .bandit.yml
@@ -3,3 +3,5 @@ skips:
- B311
- B320
- B410
- B113 # `Requests call without timeout` these requests are done in the benchmark and examples scripts only
- B403 # We are using pickle for tests only
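For context, B113 is Bandit's check for HTTP requests made without an explicit timeout; per the new comment, such calls occur only in the benchmark and example scripts. A minimal illustration of what the check flags (my own example, not code from this repository):

```python
import requests

# Without a timeout the call can hang indefinitely if the server never responds; this is what B113 flags.
risky = requests.get("https://example.com")

# An explicit timeout (in seconds) satisfies the check.
safe = requests.get("https://example.com", timeout=10)
print(safe.status_code)
```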
4 changes: 2 additions & 2 deletions .flake8
@@ -1,3 +1,3 @@
[flake8]
ignore = E501 # line too long
exclude = .git,__pycache__,docs,.github,build,dist
ignore = E501, F401
exclude = .git,.venv,__pycache__,docs,.github,build,dist,tests,benchmarks.py
6 changes: 5 additions & 1 deletion .github/workflows/tests.yml
@@ -1,5 +1,9 @@
name: Tests
on: [push]
on:
  push:
    branches:
      - main
      - dev

concurrency:
  group: ${{github.workflow}}-${{ github.ref }}
11 changes: 8 additions & 3 deletions .pre-commit-config.yaml
@@ -1,14 +1,19 @@
repos:
  - repo: https://github.com/PyCQA/bandit
    rev: 1.7.8
    rev: 1.8.0
    hooks:
      - id: bandit
        args: [-r, -c, .bandit.yml]
  - repo: https://github.com/PyCQA/flake8
    rev: 7.0.0
    rev: 7.1.1
    hooks:
      - id: flake8
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort
      - id: isort
  - repo: https://github.com/netromdk/vermin
    rev: v1.6.0
    hooks:
      - id: vermin
        args: ['-t=3.8-', '--violations', '--eval-annotations', '--no-tips']
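The new vermin hook pins the minimum supported interpreter to Python 3.8 (`-t=3.8-`) and reports violations. A small self-contained illustration of the kind of issue it catches (not from the repository): `functools.cache` exists only on 3.9+, while `lru_cache` works on 3.8.

```python
# from functools import cache         # 3.9+ only; vermin would report this under a 3.8 target
from functools import lru_cache       # available on 3.8, so it passes the `-t=3.8-` check


@lru_cache(maxsize=None)
def fib(n: int) -> int:
    return n if n < 2 else fib(n - 1) + fib(n - 2)


print(fib(10))  # 55
```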
4 changes: 2 additions & 2 deletions README.md
@@ -6,7 +6,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

```python
>> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
>> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
# Fetch websites' source under the radar!
>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
>> print(page.status)
@@ -223,7 +223,7 @@ All of them can take these initialization arguments: `auto_match`, `huge_tree`,
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
```python
from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
```
then use it right away without initializing like:
```python
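# Hypothetical usage sketch; the README's actual example lines are not shown in this diff.
# `Fetcher.get` and its `stealthy_headers` flag are assumptions about the library's public interface.
page = Fetcher.get('https://example.com', stealthy_headers=True)
print(page.status)
```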
15 changes: 8 additions & 7 deletions benchmarks.py
@@ -1,17 +1,18 @@
import functools
import time
import timeit
import functools
import requests
from statistics import mean

from scrapling import Adaptor
from parsel import Selector
from lxml import etree, html
import requests
from autoscraper import AutoScraper
from bs4 import BeautifulSoup
from lxml import etree, html
from mechanicalsoup import StatefulBrowser
from parsel import Selector
from pyquery import PyQuery as pq
from autoscraper import AutoScraper
from selectolax.parser import HTMLParser
from mechanicalsoup import StatefulBrowser

from scrapling import Adaptor

large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'

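The rest of the benchmark script is collapsed above. As a rough sketch of how such a comparison can be timed, using only names already imported in the file (the selector, repetition count, and overall structure below are my own assumptions):

```python
import timeit

from bs4 import BeautifulSoup
from scrapling import Adaptor

large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'


def scrapling_select() -> int:
    # auto_match is switched off so only raw parsing plus CSS selection is measured
    return len(Adaptor(large_html, auto_match=False).css('.item'))


def bs4_select() -> int:
    return len(BeautifulSoup(large_html, 'lxml').select('.item'))


print('scrapling :', timeit.timeit(scrapling_select, number=10))
print('bs4 + lxml:', timeit.timeit(bs4_select, number=10))
```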
42 changes: 42 additions & 0 deletions cleanup.py
@@ -0,0 +1,42 @@
import shutil
from pathlib import Path


# Clean up after installing for local development
def clean():
    # Get the current directory
    base_dir = Path.cwd()

    # Directories and patterns to clean
    cleanup_patterns = [
        'build',
        'dist',
        '*.egg-info',
        '__pycache__',
        '.eggs',
        '.pytest_cache'
    ]

    # Clean directories
    for pattern in cleanup_patterns:
        for path in base_dir.glob(pattern):
            try:
                if path.is_dir():
                    shutil.rmtree(path)
                else:
                    path.unlink()
                print(f"Removed: {path}")
            except Exception as e:
                print(f"Could not remove {path}: {e}")

    # Remove compiled Python files
    for path in base_dir.rglob('*.py[co]'):
        try:
            path.unlink()
            print(f"Removed compiled file: {path}")
        except Exception as e:
            print(f"Could not remove {path}: {e}")


if __name__ == '__main__':
    clean()
2 changes: 1 addition & 1 deletion docs/Examples/selectorless_stackoverflow.py
@@ -4,6 +4,7 @@
"""

import requests

from scrapling import Adaptor

response = requests.get('https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2')
@@ -22,4 +23,3 @@
# We will get all the rest of the titles/authors in the page depending on the first title and the first author we got above as a starting point
for i, (title, author) in enumerate(zip(first_question_title.find_similar(), first_question_author.find_similar()), start=1):
print(i, title.text, author.text)

7 changes: 4 additions & 3 deletions scrapling/__init__.py
@@ -1,10 +1,11 @@
# Declare top-level shortcuts
from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher, CustomFetcher
from scrapling.core.custom_types import AttributesHandler, TextHandler
from scrapling.fetchers import (CustomFetcher, Fetcher, PlayWrightFetcher,
StealthyFetcher)
from scrapling.parser import Adaptor, Adaptors
from scrapling.core.custom_types import TextHandler, AttributesHandler

__author__ = "Karim Shoair ([email protected])"
__version__ = "0.2.7"
__version__ = "0.2.8"
__copyright__ = "Copyright (c) 2024 Karim Shoair"


5 changes: 2 additions & 3 deletions scrapling/core/_types.py
@@ -2,9 +2,8 @@
Type definitions for type checking purposes.
"""

from typing import (
Dict, Optional, Union, Callable, Any, List, Tuple, Pattern, Generator, Iterable, Type, TYPE_CHECKING, Literal
)
from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
List, Literal, Optional, Pattern, Tuple, Type, Union)

try:
from typing import Protocol
10 changes: 5 additions & 5 deletions scrapling/core/custom_types.py
@@ -1,13 +1,13 @@
import re
from types import MappingProxyType
from collections.abc import Mapping
from types import MappingProxyType

from scrapling.core.utils import _is_iterable, flatten
from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex

from orjson import loads, dumps
from orjson import dumps, loads
from w3lib.html import replace_entities as _replace_entities

from scrapling.core._types import Dict, List, Pattern, SupportsIndex, Union
from scrapling.core.utils import _is_iterable, flatten


class TextHandler(str):
"""Extends standard Python string by adding more functionality"""
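As the docstring above states, `TextHandler` extends `str`, so parsed text behaves like a normal string while gaining extra helpers. A self-contained sketch of that pattern (the helper method below is invented for illustration and is not the library's real interface):

```python
import re


class DemoTextHandler(str):
    """Toy str subclass: everything a str can do, plus one regex convenience."""

    def re_first(self, pattern: str, default: str = '') -> str:
        match = re.search(pattern, self)
        return match.group(0) if match else default


text = DemoTextHandler('Price: 42 USD')
print(text.upper())            # ordinary str methods still work
print(text.re_first(r'\d+'))   # 42
```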
12 changes: 6 additions & 6 deletions scrapling/core/storage_adaptors.py
@@ -1,16 +1,16 @@
import orjson
import sqlite3
import logging
import sqlite3
import threading
from hashlib import sha256
from abc import ABC, abstractmethod
from hashlib import sha256

from scrapling.core._types import Dict, Optional, Union
from scrapling.core.utils import _StorageTools, cache

import orjson
from lxml import html
from tldextract import extract as tld

from scrapling.core._types import Dict, Optional, Union
from scrapling.core.utils import _StorageTools, cache


class StorageSystemMixin(ABC):
# If you want to make your own storage system, you have to inherit from this
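The comment above marks `StorageSystemMixin` as the class to inherit from when writing a custom storage backend. Its abstract methods are not visible in this diff, so here is a self-contained sketch of the general ABC pattern it implies, with made-up method names rather than the mixin's real interface:

```python
from abc import ABC, abstractmethod


class DemoStorageBase(ABC):
    """Stand-in for an abstract storage interface; not scrapling's actual mixin."""

    @abstractmethod
    def save(self, key: str, value: dict) -> None: ...

    @abstractmethod
    def retrieve(self, key: str) -> dict: ...


class InMemoryStorage(DemoStorageBase):
    def __init__(self) -> None:
        self._data: dict = {}

    def save(self, key: str, value: dict) -> None:
        self._data[key] = value

    def retrieve(self, key: str) -> dict:
        return self._data.get(key, {})


storage = InMemoryStorage()
storage.save('example.com', {'tag': 'div'})
print(storage.retrieve('example.com'))
```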
11 changes: 5 additions & 6 deletions scrapling/core/translator.py
@@ -10,15 +10,14 @@

import re

from w3lib.html import HTML5_WHITESPACE
from scrapling.core.utils import cache
from scrapling.core._types import Any, Optional, Protocol, Self

from cssselect.xpath import ExpressionError
from cssselect.xpath import XPathExpr as OriginalXPathExpr
from cssselect import HTMLTranslator as OriginalHTMLTranslator
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
from cssselect.xpath import ExpressionError
from cssselect.xpath import XPathExpr as OriginalXPathExpr
from w3lib.html import HTML5_WHITESPACE

from scrapling.core._types import Any, Optional, Protocol, Self
from scrapling.core.utils import cache

regex = f"[{HTML5_WHITESPACE}]+"
replace_html5_whitespaces = re.compile(regex).sub
27 changes: 15 additions & 12 deletions scrapling/core/utils.py
@@ -1,22 +1,25 @@
import re
import logging
import re
from itertools import chain
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache

from scrapling.core._types import Dict, Iterable, Any, Union

import orjson
from lxml import html

from scrapling.core._types import Any, Dict, Iterable, Union

# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
from functools import lru_cache as cache # isort:skip


html_forbidden = {html.HtmlComment, }
logging.basicConfig(
level=logging.ERROR,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler()
]
)
level=logging.ERROR,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler()
]
)


def is_jsonable(content: Union[bytes, str]) -> bool:
@@ -94,7 +97,7 @@ def _get_element_path(cls, element: html.HtmlElement):
parent = element.getparent()
return tuple(
(element.tag,) if parent is None else (
cls._get_element_path(parent) + (element.tag,)
cls._get_element_path(parent) + (element.tag,)
)
)

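The comment kept in the hunk above explains the import alias: decorating a class with `lru_cache` caches the constructor call, which yields a de-facto singleton, and `lru_cache` (unlike `functools.cache`) is still available on Python 3.8. A minimal demonstration of that pattern (my own example):

```python
from functools import lru_cache as cache


@cache(maxsize=None)
class Settings:
    """Every call to Settings() returns the one cached instance."""

    def __init__(self) -> None:
        self.options = {'level': 'ERROR'}


first = Settings()
second = Settings()
print(first is second)  # True: the class behaves like a singleton
```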
2 changes: 1 addition & 1 deletion scrapling/defaults.py
@@ -1,4 +1,4 @@
from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher

# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
Fetcher = Fetcher()
4 changes: 2 additions & 2 deletions scrapling/engines/__init__.py
@@ -1,7 +1,7 @@
from .camo import CamoufoxEngine
from .static import StaticEngine
from .pw import PlaywrightEngine
from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
from .pw import PlaywrightEngine
from .static import StaticEngine
from .toolbelt import check_if_engine_usable

__all__ = ['CamoufoxEngine', 'PlaywrightEngine']
20 changes: 8 additions & 12 deletions scrapling/engines/camo.py
@@ -1,20 +1,16 @@
import logging
from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal

from scrapling.engines.toolbelt import (
Response,
do_nothing,
StatusText,
get_os_name,
intercept_route,
check_type_validity,
construct_proxy_dict,
generate_convincing_referer,
)

from camoufox import DefaultAddons
from camoufox.sync_api import Camoufox

from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
Union)
from scrapling.engines.toolbelt import (Response, StatusText,
check_type_validity,
construct_proxy_dict, do_nothing,
generate_convincing_referer,
get_os_name, intercept_route)


class CamoufoxEngine:
def __init__(
23 changes: 9 additions & 14 deletions scrapling/engines/pw.py
@@ -1,20 +1,15 @@
import json
import logging
from scrapling.core._types import Union, Callable, Optional, List, Dict

from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
from scrapling.engines.toolbelt import (
Response,
do_nothing,
StatusText,
js_bypass_path,
intercept_route,
generate_headers,
construct_cdp_url,
check_type_validity,
construct_proxy_dict,
generate_convincing_referer,
)
from scrapling.core._types import Callable, Dict, List, Optional, Union
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
NSTBROWSER_DEFAULT_QUERY)
from scrapling.engines.toolbelt import (Response, StatusText,
check_type_validity, construct_cdp_url,
construct_proxy_dict, do_nothing,
generate_convincing_referer,
generate_headers, intercept_route,
js_bypass_path)


class PlaywrightEngine:
7 changes: 4 additions & 3 deletions scrapling/engines/static.py
@@ -1,11 +1,12 @@
import logging

from scrapling.core._types import Union, Optional, Dict
from .toolbelt import Response, generate_convincing_referer, generate_headers

import httpx
from httpx._models import Response as httpxResponse

from scrapling.core._types import Dict, Optional, Union

from .toolbelt import Response, generate_convincing_referer, generate_headers


class StaticEngine:
def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
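`StaticEngine` wraps plain HTTP fetching with httpx, as the imports and the `follow_redirects`/`timeout` parameters in the signature above suggest. A minimal sketch of that kind of call (the Referer value is a placeholder of mine, not what `generate_convincing_referer` actually produces):

```python
import httpx

with httpx.Client(follow_redirects=True, timeout=10) as client:
    # A convincing Referer header is roughly what the toolbelt helper above is for.
    response = client.get('https://example.com', headers={'Referer': 'https://www.google.com/'})
    print(response.status_code, len(response.text))
```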
