Commit
Merge pull request #21 from D4Vinci/dev
v0.2.8
D4Vinci authored Nov 30, 2024
2 parents d6cc07d + c481a1c commit 012820c
Showing 32 changed files with 186 additions and 137 deletions.
2 changes: 2 additions & 0 deletions .bandit.yml
@@ -3,3 +3,5 @@ skips:
- B311
- B320
- B410
- B113 # `Requests call without timeout` these requests are done in the benchmark and examples scripts only
- B403 # We are using pickle for tests only
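For context, B113 is Bandit's check for HTTP requests made without an explicit timeout; per the new comment, such calls occur only in the benchmark and example scripts. A minimal illustration of what the check flags (my own example, not code from this repository):

```python
import requests

# Without a timeout the call can hang indefinitely if the server never responds; this is what B113 flags.
risky = requests.get("https://example.com")

# An explicit timeout (in seconds) satisfies the check.
safe = requests.get("https://example.com", timeout=10)
print(safe.status_code)
```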
4 changes: 2 additions & 2 deletions .flake8
@@ -1,3 +1,3 @@
[flake8]
ignore = E501 # line too long
exclude = .git,__pycache__,docs,.github,build,dist
ignore = E501, F401
exclude = .git,.venv,__pycache__,docs,.github,build,dist,tests,benchmarks.py
6 changes: 5 additions & 1 deletion .github/workflows/tests.yml
@@ -1,5 +1,9 @@
name: Tests
on: [push]
on:
  push:
    branches:
      - main
      - dev

concurrency:
  group: ${{github.workflow}}-${{ github.ref }}
11 changes: 8 additions & 3 deletions .pre-commit-config.yaml
@@ -1,14 +1,19 @@
repos:
  - repo: https://github.com/PyCQA/bandit
    rev: 1.7.8
    rev: 1.8.0
    hooks:
      - id: bandit
        args: [-r, -c, .bandit.yml]
  - repo: https://github.com/PyCQA/flake8
    rev: 7.0.0
    rev: 7.1.1
    hooks:
      - id: flake8
  - repo: https://github.com/pycqa/isort
    rev: 5.13.2
    hooks:
      - id: isort
      - id: isort
  - repo: https://github.com/netromdk/vermin
    rev: v1.6.0
    hooks:
      - id: vermin
        args: ['-t=3.8-', '--violations', '--eval-annotations', '--no-tips']
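The new vermin hook pins the minimum supported interpreter to Python 3.8 (`-t=3.8-`) and reports violations. A small self-contained illustration of the kind of issue it catches (not from the repository): `functools.cache` exists only on 3.9+, while `lru_cache` works on 3.8.

```python
# from functools import cache         # 3.9+ only; vermin would report this under a 3.8 target
from functools import lru_cache       # available on 3.8, so it passes the `-t=3.8-` check


@lru_cache(maxsize=None)
def fib(n: int) -> int:
    return n if n < 2 else fib(n - 1) + fib(n - 2)


print(fib(10))  # 55
```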
4 changes: 2 additions & 2 deletions README.md
@@ -6,7 +6,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

```python
>> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
>> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
# Fetch websites' source under the radar!
>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
>> print(page.status)
@@ -223,7 +223,7 @@ All of them can take these initialization arguments: `auto_match`, `huge_tree`,
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
```python
from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
```
then use it right away without initializing like:
```python
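# Hypothetical usage sketch; the README's actual example lines are not shown in this diff.
# `Fetcher.get` and its `stealthy_headers` flag are assumptions about the library's public interface.
page = Fetcher.get('https://example.com', stealthy_headers=True)
print(page.status)
```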
15 changes: 8 additions & 7 deletions benchmarks.py
@@ -1,17 +1,18 @@
import functools
import time
import timeit
import functools
import requests
from statistics import mean

from scrapling import Adaptor
from parsel import Selector
from lxml import etree, html
import requests
from autoscraper import AutoScraper
from bs4 import BeautifulSoup
from lxml import etree, html
from mechanicalsoup import StatefulBrowser
from parsel import Selector
from pyquery import PyQuery as pq
from autoscraper import AutoScraper
from selectolax.parser import HTMLParser
from mechanicalsoup import StatefulBrowser

from scrapling import Adaptor

large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'

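The rest of the benchmark script is collapsed above. As a rough sketch of how such a comparison can be timed, using only names already imported in the file (the selector, repetition count, and overall structure below are my own assumptions):

```python
import timeit

from bs4 import BeautifulSoup
from scrapling import Adaptor

large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'


def scrapling_select() -> int:
    # auto_match is switched off so only raw parsing plus CSS selection is measured
    return len(Adaptor(large_html, auto_match=False).css('.item'))


def bs4_select() -> int:
    return len(BeautifulSoup(large_html, 'lxml').select('.item'))


print('scrapling :', timeit.timeit(scrapling_select, number=10))
print('bs4 + lxml:', timeit.timeit(bs4_select, number=10))
```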
42 changes: 42 additions & 0 deletions cleanup.py
@@ -0,0 +1,42 @@
import shutil
from pathlib import Path


# Clean up after installing for local development
def clean():
    # Get the current directory
    base_dir = Path.cwd()

    # Directories and patterns to clean
    cleanup_patterns = [
        'build',
        'dist',
        '*.egg-info',
        '__pycache__',
        '.eggs',
        '.pytest_cache'
    ]

    # Clean directories
    for pattern in cleanup_patterns:
        for path in base_dir.glob(pattern):
            try:
                if path.is_dir():
                    shutil.rmtree(path)
                else:
                    path.unlink()
                print(f"Removed: {path}")
            except Exception as e:
                print(f"Could not remove {path}: {e}")

    # Remove compiled Python files
    for path in base_dir.rglob('*.py[co]'):
        try:
            path.unlink()
            print(f"Removed compiled file: {path}")
        except Exception as e:
            print(f"Could not remove {path}: {e}")


if __name__ == '__main__':
    clean()
2 changes: 1 addition & 1 deletion docs/Examples/selectorless_stackoverflow.py
@@ -4,6 +4,7 @@
"""

import requests

from scrapling import Adaptor

response = requests.get('https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2')
@@ -22,4 +23,3 @@
# We will get all the rest of the titles/authors in the page depending on the first title and the first author we got above as a starting point
for i, (title, author) in enumerate(zip(first_question_title.find_similar(), first_question_author.find_similar()), start=1):
print(i, title.text, author.text)

7 changes: 4 additions & 3 deletions scrapling/__init__.py
@@ -1,10 +1,11 @@
# Declare top-level shortcuts
from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher, CustomFetcher
from scrapling.core.custom_types import AttributesHandler, TextHandler
from scrapling.fetchers import (CustomFetcher, Fetcher, PlayWrightFetcher,
StealthyFetcher)
from scrapling.parser import Adaptor, Adaptors
from scrapling.core.custom_types import TextHandler, AttributesHandler

__author__ = "Karim Shoair ([email protected])"
__version__ = "0.2.7"
__version__ = "0.2.8"
__copyright__ = "Copyright (c) 2024 Karim Shoair"


5 changes: 2 additions & 3 deletions scrapling/core/_types.py
@@ -2,9 +2,8 @@
Type definitions for type checking purposes.
"""

from typing import (
Dict, Optional, Union, Callable, Any, List, Tuple, Pattern, Generator, Iterable, Type, TYPE_CHECKING, Literal
)
from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
List, Literal, Optional, Pattern, Tuple, Type, Union)

try:
from typing import Protocol
10 changes: 5 additions & 5 deletions scrapling/core/custom_types.py
@@ -1,13 +1,13 @@
import re
from types import MappingProxyType
from collections.abc import Mapping
from types import MappingProxyType

from scrapling.core.utils import _is_iterable, flatten
from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex

from orjson import loads, dumps
from orjson import dumps, loads
from w3lib.html import replace_entities as _replace_entities

from scrapling.core._types import Dict, List, Pattern, SupportsIndex, Union
from scrapling.core.utils import _is_iterable, flatten


class TextHandler(str):
"""Extends standard Python string by adding more functionality"""
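As the docstring above states, `TextHandler` extends `str`, so parsed text behaves like a normal string while gaining extra helpers. A self-contained sketch of that pattern (the helper method below is invented for illustration and is not the library's real interface):

```python
import re


class DemoTextHandler(str):
    """Toy str subclass: everything a str can do, plus one regex convenience."""

    def re_first(self, pattern: str, default: str = '') -> str:
        match = re.search(pattern, self)
        return match.group(0) if match else default


text = DemoTextHandler('Price: 42 USD')
print(text.upper())            # ordinary str methods still work
print(text.re_first(r'\d+'))   # 42
```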
12 changes: 6 additions & 6 deletions scrapling/core/storage_adaptors.py
@@ -1,16 +1,16 @@
import orjson
import sqlite3
import logging
import sqlite3
import threading
from hashlib import sha256
from abc import ABC, abstractmethod
from hashlib import sha256

from scrapling.core._types import Dict, Optional, Union
from scrapling.core.utils import _StorageTools, cache

import orjson
from lxml import html
from tldextract import extract as tld

from scrapling.core._types import Dict, Optional, Union
from scrapling.core.utils import _StorageTools, cache


class StorageSystemMixin(ABC):
# If you want to make your own storage system, you have to inherit from this
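The comment above marks `StorageSystemMixin` as the class to inherit from when writing a custom storage backend. Its abstract methods are not visible in this diff, so here is a self-contained sketch of the general ABC pattern it implies, with made-up method names rather than the mixin's real interface:

```python
from abc import ABC, abstractmethod


class DemoStorageBase(ABC):
    """Stand-in for an abstract storage interface; not scrapling's actual mixin."""

    @abstractmethod
    def save(self, key: str, value: dict) -> None: ...

    @abstractmethod
    def retrieve(self, key: str) -> dict: ...


class InMemoryStorage(DemoStorageBase):
    def __init__(self) -> None:
        self._data: dict = {}

    def save(self, key: str, value: dict) -> None:
        self._data[key] = value

    def retrieve(self, key: str) -> dict:
        return self._data.get(key, {})


storage = InMemoryStorage()
storage.save('example.com', {'tag': 'div'})
print(storage.retrieve('example.com'))
```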
11 changes: 5 additions & 6 deletions scrapling/core/translator.py
@@ -10,15 +10,14 @@

import re

from w3lib.html import HTML5_WHITESPACE
from scrapling.core.utils import cache
from scrapling.core._types import Any, Optional, Protocol, Self

from cssselect.xpath import ExpressionError
from cssselect.xpath import XPathExpr as OriginalXPathExpr
from cssselect import HTMLTranslator as OriginalHTMLTranslator
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
from cssselect.xpath import ExpressionError
from cssselect.xpath import XPathExpr as OriginalXPathExpr
from w3lib.html import HTML5_WHITESPACE

from scrapling.core._types import Any, Optional, Protocol, Self
from scrapling.core.utils import cache

regex = f"[{HTML5_WHITESPACE}]+"
replace_html5_whitespaces = re.compile(regex).sub
27 changes: 15 additions & 12 deletions scrapling/core/utils.py
@@ -1,22 +1,25 @@
import re
import logging
import re
from itertools import chain
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache

from scrapling.core._types import Dict, Iterable, Any, Union

import orjson
from lxml import html

from scrapling.core._types import Any, Dict, Iterable, Union

# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
from functools import lru_cache as cache # isort:skip


html_forbidden = {html.HtmlComment, }
logging.basicConfig(
level=logging.ERROR,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler()
]
)
level=logging.ERROR,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler()
]
)


def is_jsonable(content: Union[bytes, str]) -> bool:
@@ -94,7 +97,7 @@ def _get_element_path(cls, element: html.HtmlElement):
parent = element.getparent()
return tuple(
(element.tag,) if parent is None else (
cls._get_element_path(parent) + (element.tag,)
cls._get_element_path(parent) + (element.tag,)
)
)

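The comment kept in the hunk above explains the import alias: decorating a class with `lru_cache` caches the constructor call, which yields a de-facto singleton, and `lru_cache` (unlike `functools.cache`) is still available on Python 3.8. A minimal demonstration of that pattern (my own example):

```python
from functools import lru_cache as cache


@cache(maxsize=None)
class Settings:
    """Every call to Settings() returns the one cached instance."""

    def __init__(self) -> None:
        self.options = {'level': 'ERROR'}


first = Settings()
second = Settings()
print(first is second)  # True: the class behaves like a singleton
```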
2 changes: 1 addition & 1 deletion scrapling/defaults.py
@@ -1,4 +1,4 @@
from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher

# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
Fetcher = Fetcher()
4 changes: 2 additions & 2 deletions scrapling/engines/__init__.py
@@ -1,7 +1,7 @@
from .camo import CamoufoxEngine
from .static import StaticEngine
from .pw import PlaywrightEngine
from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
from .pw import PlaywrightEngine
from .static import StaticEngine
from .toolbelt import check_if_engine_usable

__all__ = ['CamoufoxEngine', 'PlaywrightEngine']
20 changes: 8 additions & 12 deletions scrapling/engines/camo.py
@@ -1,20 +1,16 @@
import logging
from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal

from scrapling.engines.toolbelt import (
Response,
do_nothing,
StatusText,
get_os_name,
intercept_route,
check_type_validity,
construct_proxy_dict,
generate_convincing_referer,
)

from camoufox import DefaultAddons
from camoufox.sync_api import Camoufox

from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
Union)
from scrapling.engines.toolbelt import (Response, StatusText,
check_type_validity,
construct_proxy_dict, do_nothing,
generate_convincing_referer,
get_os_name, intercept_route)


class CamoufoxEngine:
def __init__(
23 changes: 9 additions & 14 deletions scrapling/engines/pw.py
@@ -1,20 +1,15 @@
import json
import logging
from scrapling.core._types import Union, Callable, Optional, List, Dict

from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
from scrapling.engines.toolbelt import (
Response,
do_nothing,
StatusText,
js_bypass_path,
intercept_route,
generate_headers,
construct_cdp_url,
check_type_validity,
construct_proxy_dict,
generate_convincing_referer,
)
from scrapling.core._types import Callable, Dict, List, Optional, Union
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
NSTBROWSER_DEFAULT_QUERY)
from scrapling.engines.toolbelt import (Response, StatusText,
check_type_validity, construct_cdp_url,
construct_proxy_dict, do_nothing,
generate_convincing_referer,
generate_headers, intercept_route,
js_bypass_path)


class PlaywrightEngine:
7 changes: 4 additions & 3 deletions scrapling/engines/static.py
@@ -1,11 +1,12 @@
import logging

from scrapling.core._types import Union, Optional, Dict
from .toolbelt import Response, generate_convincing_referer, generate_headers

import httpx
from httpx._models import Response as httpxResponse

from scrapling.core._types import Dict, Optional, Union

from .toolbelt import Response, generate_convincing_referer, generate_headers


class StaticEngine:
def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
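`StaticEngine` wraps plain HTTP fetching with httpx, as the imports and the `follow_redirects`/`timeout` parameters in the signature above suggest. A minimal sketch of that kind of call (the Referer value is a placeholder of mine, not what `generate_convincing_referer` actually produces):

```python
import httpx

with httpx.Client(follow_redirects=True, timeout=10) as client:
    # A convincing Referer header is roughly what the toolbelt helper above is for.
    response = client.get('https://example.com', headers={'Referer': 'https://www.google.com/'})
    print(response.status_code, len(response.text))
```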
