From 3231c830a08e04b998d576f67797677f5fd6d9bd Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Tue, 7 May 2024 17:38:32 +0200 Subject: [PATCH 001/169] better listing --- python/dolma/cli/__main__.py | 5 +-- python/dolma/cli/listers.py | 60 +++++++++++++++++++++++++++++++++++ python/dolma/cli/resolvers.py | 37 ++++++++++++++------- python/dolma/cli/tagger.py | 34 -------------------- python/dolma/core/registry.py | 43 ++++++++++++++++++------- 5 files changed, 121 insertions(+), 58 deletions(-) create mode 100644 python/dolma/cli/listers.py diff --git a/python/dolma/cli/__main__.py b/python/dolma/cli/__main__.py index 3b4ea842..2cd25923 100644 --- a/python/dolma/cli/__main__.py +++ b/python/dolma/cli/__main__.py @@ -13,7 +13,8 @@ # must import these to register the resolvers from .resolvers import * # noqa: F401,F403,W0401 -from .tagger import ListTaggerCli, TaggerCli +from .tagger import TaggerCli +from .listers import ListerCli from .tokenizer import TokenizerCli from .warc import WarcExtractorCli @@ -21,7 +22,7 @@ "dedupe": DeduperCli, "mix": MixerCli, "tag": TaggerCli, - "list": ListTaggerCli, + "list": ListerCli, "stat": AnalyzerCli, "tokens": TokenizerCli, "warc": WarcExtractorCli, diff --git a/python/dolma/cli/listers.py b/python/dolma/cli/listers.py new file mode 100644 index 00000000..ba6fae3e --- /dev/null +++ b/python/dolma/cli/listers.py @@ -0,0 +1,60 @@ +from dataclasses import dataclass +from typing import List, Optional + +from rich.console import Console +from rich.table import Table + +from dolma.cli import BaseCli, field +from dolma.core.loggers import get_logger +from dolma.core.registry import BaseRegistry +from dolma.core.utils import import_modules + + +@dataclass +class ListerConfig: + modules : List[str] = field( + default=[], + help="List of Python modules $PYTHONPATH to import custom registry modules from.", + ) + tagger_modules: Optional[List[str]] = field( + default=None, + help="List of Python modules $PYTHONPATH to import custom 
taggers from.", + ) + filter: Optional[str] = field( + default=None, + help="Filter which registries to list.", + ) + + +class ListerCli(BaseCli): + CONFIG = ListerConfig + DESCRIPTION = "List all available modules in registry." + + @classmethod + def run(cls, parsed_config: ListerConfig): + if parsed_config.tagger_modules is not None: + # deprecation warning + logger = get_logger(__file__) + logger.warning( + "The `tagger_modules` argument is deprecated and will be removed in a future release. " + "Please use `modules` instead." + ) + parsed_config.modules.extend(parsed_config.tagger_modules) + + # import tagger modules + import_modules(parsed_config.modules) + + for tagger_name, tagger_cls in BaseRegistry.registries(): + if parsed_config.filter is not None and parsed_config.filter.lower() not in tagger_name.lower(): + continue + + table = Table(title=tagger_name, style="bold") + table.add_column("name", justify="left", style="cyan") + table.add_column("class", justify="left", style="magenta") + + for tagger_name, tagger_cls in sorted(tagger_cls.items()): + tagger_repr = f"{tagger_cls.__module__}.{tagger_cls.__name__}" + table.add_row(tagger_name, tagger_repr) + + console = Console() + console.print(table) diff --git a/python/dolma/cli/resolvers.py b/python/dolma/cli/resolvers.py index ad799619..9c859052 100644 --- a/python/dolma/cli/resolvers.py +++ b/python/dolma/cli/resolvers.py @@ -1,36 +1,51 @@ import multiprocessing -from typing import List, TypeVar +import sys +from typing import Callable, List, Optional, TypeVar from cached_path import cached_path from omegaconf.omegaconf import OmegaConf as om -from omegaconf.omegaconf import Resolver from ..core.paths import glob_path +from ..core.registry import BaseRegistry -__all__ = ["cache", "glob", "processes"] +C = TypeVar("C", bound=Callable) -C = TypeVar("C", bound=Resolver) +class ResolverRegistry(BaseRegistry[Callable]): + @classmethod + def add(cls, name: str, desc: Optional[str] = None) -> Callable[[C], C]: 
+ _add_fn = super().add(name, desc) -def resolver(resolver: C) -> C: - resolver_name = f"d.{resolver.__name__}" - om.register_new_resolver(resolver_name, resolver, replace=True) - return resolver + def _wrapped_add_fn( + resolver: C, + base_add_fn: C = _add_fn, # type: ignore + resolver_name: str = name, + ) -> C: + base_add_fn(resolver) + resolver_name = f"d.{resolver_name}" + om.register_new_resolver(resolver_name, resolver, replace=True) + return resolver + return _wrapped_add_fn -@resolver +@ResolverRegistry.add("cache", "Download a file and replace the path with the cached path.") def cache(path: str) -> str: return str(cached_path(path)) -@resolver +@ResolverRegistry.add("glob", "Glob this path and return a list of files.") def glob(path: str) -> List[str]: globbed = list(glob_path(path)) assert len(globbed) > 0, f"Path {path} does not match any files" return globbed -@resolver +@ResolverRegistry.add("processes", "Return the number of processes available (optionally with buffer).") def processes(n: int = 0) -> int: return max(1, multiprocessing.cpu_count() - n) + + +@ResolverRegistry.add("stdin", "Read from stdin and return list of paths.") +def stdin() -> List[str]: + return [line.strip() for line in sys.stdin] diff --git a/python/dolma/cli/tagger.py b/python/dolma/cli/tagger.py index 9982ec05..90b3aa84 100644 --- a/python/dolma/cli/tagger.py +++ b/python/dolma/cli/tagger.py @@ -2,17 +2,12 @@ from pstats import SortKey from typing import List, Optional -from rich.console import Console -from rich.table import Table - from dolma.cli import BaseCli, field, print_config from dolma.cli.shared import WorkDirConfig, make_workdirs from dolma.core.errors import DolmaConfigError from dolma.core.loggers import get_logger from dolma.core.paths import glob_path -from dolma.core.registry import TaggerRegistry from dolma.core.runtime import create_and_run_tagger -from dolma.core.utils import import_modules @dataclass @@ -141,32 +136,3 @@ def run(cls, parsed_config: 
TaggerConfig): profile_steps=parsed_config.profile.steps, profile_sort_key=parsed_config.profile.sort_key, ) - - -@dataclass -class ListTaggerConfig: - tagger_modules: List[str] = field( - default=[], - help="List of Python modules $PYTHONPATH to import custom taggers from.", - ) - - -class ListTaggerCli(BaseCli): - CONFIG = ListTaggerConfig - DESCRIPTION = "List available taggers." - - @classmethod - def run(cls, parsed_config: ListTaggerConfig): - # import tagger modules - import_modules(parsed_config.tagger_modules) - - table = Table(title="dolma taggers", style="bold") - table.add_column("name", justify="left", style="cyan") - table.add_column("class", justify="left", style="magenta") - - for tagger_name, tagger_cls in sorted(TaggerRegistry.items()): - tagger_repr = f"{tagger_cls.__module__}.{tagger_cls.__name__}" - table.add_row(tagger_name, tagger_repr) - - console = Console() - console.print(table) diff --git a/python/dolma/core/registry.py b/python/dolma/core/registry.py index cc132627..9d6cc3ea 100644 --- a/python/dolma/core/registry.py +++ b/python/dolma/core/registry.py @@ -1,18 +1,30 @@ -from typing import Callable, Dict, Generator, Generic, Tuple, Type, TypeVar +from typing import Callable, Dict, Generator, Generic, Optional, Tuple, Type, TypeVar from .taggers import BaseTagger -T = TypeVar("T", bound=Type) -R = TypeVar("R", bound=Type) +T = TypeVar("T") +R = TypeVar("R") class BaseRegistry(Generic[T]): """A registry for objects.""" - _registry_storage: Dict[str, T] + _registry_of_registries: Dict[str, Type["BaseRegistry"]] = {} + _registry_storage: Dict[str, Tuple[T, Optional[str]]] @classmethod - def _get_storage(cls) -> Dict[str, T]: + def _add_to_registry_of_registries(cls) -> None: + name = cls.__name__ + if name not in cls._registry_of_registries: + cls._registry_of_registries[name] = cls + + @classmethod + def registries(cls) -> Generator[Tuple[str, Type["BaseRegistry"]], None, None]: + """Yield all registries in the registry of registries.""" + 
yield from sorted(cls._registry_of_registries.items()) + + @classmethod + def _get_storage(cls) -> Dict[str, Tuple[T, Optional[str]]]: if not hasattr(cls, "_registry_storage"): cls._registry_storage = {} return cls._registry_storage # pyright: ignore @@ -20,20 +32,28 @@ def _get_storage(cls) -> Dict[str, T]: @classmethod def items(cls) -> Generator[Tuple[str, T], None, None]: """Yield all items in the registry.""" - yield from sorted(cls._get_storage().items()) + yield from sorted((n, t) for (n, (t, _)) in cls._get_storage().items()) @classmethod - def add(cls, name: str) -> Callable[[R], R]: + def add(cls, name: str, desc: Optional[str] = None) -> Callable[[T], T]: """Add a class to the registry.""" - def _add(tagger_self: T, tagger_name: str = name, cls_: Type[BaseRegistry] = cls) -> T: + # Add the registry to the registry of registries + cls._add_to_registry_of_registries() + + def _add( + tagger_self: T, + tagger_name: str = name, + tagger_desc: Optional[str] = desc, + tagger_cls: Type[BaseRegistry] = cls, + ) -> T: """Add a tagger to the registry using tagger_name as the name.""" - if tagger_name in cls_._get_storage() and cls_._get_storage()[tagger_name] != tagger_self: + if tagger_name in tagger_cls._get_storage() and tagger_cls._get_storage()[tagger_name] != tagger_self: if tagger_self.__module__ == "__main__": return tagger_self raise ValueError(f"Tagger {tagger_name} already exists") - cls_._get_storage()[tagger_name] = tagger_self + tagger_cls._get_storage()[tagger_name] = (tagger_self, tagger_desc) return tagger_self return _add # type: ignore @@ -57,7 +77,8 @@ def get(cls, name: str) -> T: if name not in cls._get_storage(): tagger_names = ", ".join([tn for tn, _ in cls.items()]) raise ValueError(f"Unknown tagger {name}; available taggers: {tagger_names}") - return cls._get_storage()[name] + t, _ = cls._get_storage()[name] + return t class TaggerRegistry(BaseRegistry[Type[BaseTagger]]): From 7da8e1f1ce5b75108492d9274623125ac8712618 Mon Sep 17 00:00:00 
2001 From: Luca Soldaini Date: Tue, 7 May 2024 17:39:14 +0200 Subject: [PATCH 002/169] better resolvers --- python/dolma/cli/__main__.py | 2 +- python/dolma/cli/listers.py | 2 +- python/dolma/cli/resolvers.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/dolma/cli/__main__.py b/python/dolma/cli/__main__.py index 2cd25923..e196a171 100644 --- a/python/dolma/cli/__main__.py +++ b/python/dolma/cli/__main__.py @@ -9,12 +9,12 @@ from ..core.paths import exists from .analyzer import AnalyzerCli from .deduper import DeduperCli +from .listers import ListerCli from .mixer import MixerCli # must import these to register the resolvers from .resolvers import * # noqa: F401,F403,W0401 from .tagger import TaggerCli -from .listers import ListerCli from .tokenizer import TokenizerCli from .warc import WarcExtractorCli diff --git a/python/dolma/cli/listers.py b/python/dolma/cli/listers.py index ba6fae3e..7d97a980 100644 --- a/python/dolma/cli/listers.py +++ b/python/dolma/cli/listers.py @@ -12,7 +12,7 @@ @dataclass class ListerConfig: - modules : List[str] = field( + modules: List[str] = field( default=[], help="List of Python modules $PYTHONPATH to import custom registry modules from.", ) diff --git a/python/dolma/cli/resolvers.py b/python/dolma/cli/resolvers.py index 9c859052..a5195a84 100644 --- a/python/dolma/cli/resolvers.py +++ b/python/dolma/cli/resolvers.py @@ -8,7 +8,6 @@ from ..core.paths import glob_path from ..core.registry import BaseRegistry - C = TypeVar("C", bound=Callable) @@ -19,13 +18,14 @@ def add(cls, name: str, desc: Optional[str] = None) -> Callable[[C], C]: def _wrapped_add_fn( resolver: C, - base_add_fn: C = _add_fn, # type: ignore + base_add_fn: C = _add_fn, # type: ignore resolver_name: str = name, ) -> C: base_add_fn(resolver) resolver_name = f"d.{resolver_name}" om.register_new_resolver(resolver_name, resolver, replace=True) return resolver + return _wrapped_add_fn From 6859981074ba912c1847999faa60798e090e16a9 Mon Sep 17 
00:00:00 2001 From: Luca Soldaini Date: Wed, 8 May 2024 10:49:40 +0200 Subject: [PATCH 003/169] better list --- python/dolma/cli/listers.py | 26 ++++++++++++++++++-------- python/dolma/core/registry.py | 5 +++++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/python/dolma/cli/listers.py b/python/dolma/cli/listers.py index 7d97a980..623eb83b 100644 --- a/python/dolma/cli/listers.py +++ b/python/dolma/cli/listers.py @@ -44,17 +44,27 @@ def run(cls, parsed_config: ListerConfig): # import tagger modules import_modules(parsed_config.modules) - for tagger_name, tagger_cls in BaseRegistry.registries(): - if parsed_config.filter is not None and parsed_config.filter.lower() not in tagger_name.lower(): + for reg_item_name, reg_item_cls in BaseRegistry.registries(): + if parsed_config.filter is not None and parsed_config.filter.lower() not in reg_item_name.lower(): continue - table = Table(title=tagger_name, style="bold") - table.add_column("name", justify="left", style="cyan") - table.add_column("class", justify="left", style="magenta") + any_has_description = any( + reg_item_desc for _, _, reg_item_desc in reg_item_cls.items_with_description() + ) + + table = Table(title=reg_item_name, style="bold") + table.width + table.add_column("name", justify="left", style="cyan", no_wrap=True, ratio=1) + table.add_column("class", justify="left", style="magenta", no_wrap=False, ratio=1) + if any_has_description: + table.add_column("description", justify="left", style="blue", no_wrap=False, ratio=4) - for tagger_name, tagger_cls in sorted(tagger_cls.items()): - tagger_repr = f"{tagger_cls.__module__}.{tagger_cls.__name__}" - table.add_row(tagger_name, tagger_repr) + for reg_item_name, reg_item_cls, reg_item_desc in sorted(reg_item_cls.items_with_description()): + registry_module = f"{reg_item_cls.__module__}.{reg_item_cls.__name__}" + if any_has_description: + table.add_row(reg_item_name, registry_module, reg_item_desc) + else: + table.add_row(reg_item_name, 
registry_module) console = Console() console.print(table) diff --git a/python/dolma/core/registry.py b/python/dolma/core/registry.py index 9d6cc3ea..e0edf754 100644 --- a/python/dolma/core/registry.py +++ b/python/dolma/core/registry.py @@ -34,6 +34,11 @@ def items(cls) -> Generator[Tuple[str, T], None, None]: """Yield all items in the registry.""" yield from sorted((n, t) for (n, (t, _)) in cls._get_storage().items()) + @classmethod + def items_with_description(cls) -> Generator[Tuple[str, T, Optional[str]], None, None]: + """Yield all items in the registry with their descriptions.""" + yield from sorted((n, t, d) for (n, (t, d)) in cls._get_storage().items()) + @classmethod def add(cls, name: str, desc: Optional[str] = None) -> Callable[[T], T]: """Add a class to the registry.""" From 6e8f31d23271522c60c5638f4e73397c9f6c3529 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Wed, 8 May 2024 12:12:22 +0200 Subject: [PATCH 004/169] parsing from stdin --- python/dolma/cli/__init__.py | 1 + python/dolma/cli/analyzer.py | 4 +++- python/dolma/cli/deduper.py | 9 ++++++++- python/dolma/cli/shared.py | 17 ++++++++++++++++- python/dolma/cli/tagger.py | 4 +++- python/dolma/cli/tokenizer.py | 3 ++- python/dolma/cli/warc.py | 4 +++- 7 files changed, 36 insertions(+), 6 deletions(-) diff --git a/python/dolma/cli/__init__.py b/python/dolma/cli/__init__.py index de6349cd..f5e0bd84 100644 --- a/python/dolma/cli/__init__.py +++ b/python/dolma/cli/__init__.py @@ -39,6 +39,7 @@ "make_parser", "namespace_to_nested_omegaconf", "print_config", + "maybe_parse_from_stdin", ] diff --git a/python/dolma/cli/analyzer.py b/python/dolma/cli/analyzer.py index 414615b5..124901f8 100644 --- a/python/dolma/cli/analyzer.py +++ b/python/dolma/cli/analyzer.py @@ -2,7 +2,7 @@ from typing import List, Optional from dolma.cli import BaseCli, field, print_config -from dolma.cli.shared import WorkDirConfig, make_workdirs +from dolma.cli.shared import WorkDirConfig, make_workdirs, maybe_parse_from_stdin 
from dolma.core.analyzer import create_and_run_analyzer from dolma.core.errors import DolmaConfigError from dolma.core.loggers import get_logger @@ -57,6 +57,8 @@ class AnalyzerCli(BaseCli): def run(cls, parsed_config: AnalyzerConfig): logger = get_logger("analyzer") + parsed_config.attributes = maybe_parse_from_stdin(parsed_config.attributes) + # perform some path validation to make sure we don't call the mixer with invalid config total_matching_documents = 0 for document in parsed_config.attributes: diff --git a/python/dolma/cli/deduper.py b/python/dolma/cli/deduper.py index de684200..346e33ff 100644 --- a/python/dolma/cli/deduper.py +++ b/python/dolma/cli/deduper.py @@ -8,7 +8,12 @@ from dolma import deduper from dolma.cli import BaseCli, field, print_config -from dolma.cli.shared import WorkDirConfig, get_path_to_temp_file, make_workdirs +from dolma.cli.shared import ( + WorkDirConfig, + get_path_to_temp_file, + make_workdirs, + maybe_parse_from_stdin, +) from dolma.core.errors import DolmaConfigError from dolma.core.loggers import get_logger from dolma.core.paths import glob_path, is_local @@ -116,6 +121,8 @@ def run(cls, parsed_config: DeduperConfig): dict_config: Dict[str, Any] = {} with ExitStack() as stack: + parsed_config.documents = maybe_parse_from_stdin(parsed_config.documents) + work_dirs = stack.enter_context(make_workdirs(parsed_config.work_dir)) # create a dedupe config to populate diff --git a/python/dolma/cli/shared.py b/python/dolma/cli/shared.py index fb731641..095d8fcf 100644 --- a/python/dolma/cli/shared.py +++ b/python/dolma/cli/shared.py @@ -1,10 +1,11 @@ import copy import os +import sys import tempfile from contextlib import ExitStack, contextmanager from dataclasses import dataclass from pathlib import Path -from typing import Generator, Optional +from typing import Generator, List, Optional, Union from dolma.cli import field @@ -39,3 +40,17 @@ def make_workdirs(config: WorkDirConfig) -> Generator[WorkDirConfig, None, None] config.output 
= stack.enter_context(tempfile.TemporaryDirectory(prefix="dolma-output-")) yield config + + +def maybe_parse_from_stdin(paths: Union[str, List[str]]) -> List[str]: + """ + If paths is a single string equal to "-", read from stdin and return a list of lines; + otherwise return the input as is. + """ + if isinstance(paths, str): + paths = [paths] + + if paths == ["-"]: + return [str(r.strip()) for r in sys.stdin] + + return paths[:] diff --git a/python/dolma/cli/tagger.py b/python/dolma/cli/tagger.py index 90b3aa84..0359e00c 100644 --- a/python/dolma/cli/tagger.py +++ b/python/dolma/cli/tagger.py @@ -3,7 +3,7 @@ from typing import List, Optional from dolma.cli import BaseCli, field, print_config -from dolma.cli.shared import WorkDirConfig, make_workdirs +from dolma.cli.shared import WorkDirConfig, make_workdirs, maybe_parse_from_stdin from dolma.core.errors import DolmaConfigError from dolma.core.loggers import get_logger from dolma.core.paths import glob_path @@ -100,6 +100,8 @@ def run(cls, parsed_config: TaggerConfig): logger = get_logger("tagger") with make_workdirs(parsed_config.work_dir) as work_dirs: + parsed_config.documents = maybe_parse_from_stdin(parsed_config.documents) + documents = [str(p) for p in parsed_config.documents] taggers = [str(p) for p in parsed_config.taggers] diff --git a/python/dolma/cli/tokenizer.py b/python/dolma/cli/tokenizer.py index 6ecce941..61b3a3e5 100644 --- a/python/dolma/cli/tokenizer.py +++ b/python/dolma/cli/tokenizer.py @@ -2,7 +2,7 @@ from typing import List, Optional from dolma.cli import BaseCli, field, print_config -from dolma.cli.shared import WorkDirConfig, make_workdirs +from dolma.cli.shared import WorkDirConfig, make_workdirs, maybe_parse_from_stdin from dolma.core.errors import DolmaConfigError from dolma.core.loggers import get_logger from dolma.core.paths import glob_path @@ -151,6 +151,7 @@ def run(cls, parsed_config: TokenizationConfig): logger = get_logger("tagger") with make_workdirs(parsed_config.work_dir) as 
work_dirs: + parsed_config.documents = maybe_parse_from_stdin(parsed_config.documents) documents = [str(p) for p in parsed_config.documents] # perform some path validation to make sure we don't call the mixer with invalid config diff --git a/python/dolma/cli/warc.py b/python/dolma/cli/warc.py index 9a8da2ea..d7fbfba7 100644 --- a/python/dolma/cli/warc.py +++ b/python/dolma/cli/warc.py @@ -2,7 +2,7 @@ from typing import List from dolma.cli import BaseCli, field, print_config -from dolma.cli.shared import WorkDirConfig, make_workdirs +from dolma.cli.shared import WorkDirConfig, make_workdirs, maybe_parse_from_stdin from dolma.core.errors import DolmaConfigError from dolma.core.loggers import get_logger from dolma.core.paths import glob_path @@ -76,6 +76,8 @@ def run(cls, parsed_config: WarcExtractorConfig): logger = get_logger("warc") with make_workdirs(parsed_config.work_dir) as work_dirs: + parsed_config.documents = maybe_parse_from_stdin(parsed_config.documents) + documents = [str(p) for p in parsed_config.documents] destination = [str(p) for p in parsed_config.destination] From 6395416ddeaf3652c560faa43d07061f6ab7dce2 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 10 May 2024 14:22:47 +0000 Subject: [PATCH 005/169] fixed behavior with old omegaconf --- python/dolma/cli/__init__.py | 78 +++++++++++++++++++++++++++++++---- python/dolma/cli/analyzer.py | 4 +- python/dolma/cli/deduper.py | 9 +--- python/dolma/cli/resolvers.py | 23 ++++++++--- python/dolma/cli/tagger.py | 3 +- python/dolma/cli/tokenizer.py | 3 +- python/dolma/cli/warc.py | 34 +++++++-------- 7 files changed, 108 insertions(+), 46 deletions(-) diff --git a/python/dolma/cli/__init__.py b/python/dolma/cli/__init__.py index f5e0bd84..137f3ef4 100644 --- a/python/dolma/cli/__init__.py +++ b/python/dolma/cli/__init__.py @@ -25,6 +25,7 @@ get_origin, ) +from necessary import necessary from omegaconf import MISSING, DictConfig, ListConfig from omegaconf import OmegaConf as om from omegaconf.errors 
import OmegaConfBaseException @@ -33,14 +34,7 @@ from ..core.errors import DolmaConfigError -__all__ = [ - "BaseCli", - "field", - "make_parser", - "namespace_to_nested_omegaconf", - "print_config", - "maybe_parse_from_stdin", -] +__all__ = ["BaseCli", "field", "make_parser", "namespace_to_nested_omegaconf", "print_config"] T = TypeVar("T", bound=Any) @@ -147,7 +141,13 @@ def namespace_to_nested_omegaconf(args: Namespace, structured: Type[T], config: om.create(config or {}), om.create(nested_config_dict) ) # pyright: ignore (pylance is confused because om.create might return a DictConfig or a ListConfig) + # resolve any interpolations in the config + om.resolve(untyped_config) + + # create structured config from cli dataclass base_structured_config: DictConfig = om.structured(structured) + + # merge with options parsed from config file and merged_config = om.merge(base_structured_config, untyped_config) # check for type @@ -200,3 +200,65 @@ def run_from_args(cls, args: Namespace, config: Optional[dict] = None): @classmethod def run(cls, parsed_config: D): raise NotImplementedError("Abstract method; must be implemented in subclass") + + +def patch_old_omegaconf(): + """Monkey patch omegaconf below version 2.3.0 to support custom resolver returning + lists or dicts. 
Applies patch https://github.com/omry/omegaconf/pull/1093""" + + if necessary(("omegaconf", "2.4.0"), soft=True): + # no need to patch + return + + if getattr(patch_old_omegaconf, "__patched__", False): + # already patched + return + + from omegaconf import _impl # pylint: disable=import-outside-toplevel + from omegaconf import ( # pylint: disable=import-outside-toplevel + Container, + Node, + ValueNode, + ) + from omegaconf._utils import ( # noqa: F401 # pylint: disable=import-outside-toplevel + _ensure_container, + _get_value, + is_primitive_container, + is_structured_config, + ) + from omegaconf.errors import ( + InterpolationToMissingValueError, # pylint: disable=import-outside-toplevel + ) + from omegaconf.nodes import ( + InterpolationResultNode, # pylint: disable=import-outside-toplevel + ) + + def _resolve_container_value(cfg: Container, key: Any) -> None: + node = cfg._get_child(key) # pylint: disable=protected-access + assert isinstance(node, Node) + if node._is_interpolation(): # pylint: disable=protected-access + try: + resolved = node._dereference_node() # pylint: disable=protected-access + except InterpolationToMissingValueError: + node._set_value(MISSING) # pylint: disable=protected-access + else: + if isinstance(resolved, Container): + _impl._resolve(resolved) # pylint: disable=protected-access + if isinstance(resolved, InterpolationResultNode): + resolved_value = _get_value(resolved) + if is_primitive_container(resolved_value) or is_structured_config(resolved_value): + resolved = _ensure_container(resolved_value) + if isinstance(resolved, Container) and isinstance(node, ValueNode): + cfg[key] = resolved + else: + node._set_value(_get_value(resolved)) # pylint: disable=protected-access + else: + _impl._resolve(node) # pylint: disable=protected-access + + # set new function and mark as patched + setattr(_impl, "_resolve_container_value", _resolve_container_value) + setattr(patch_old_omegaconf, "__patched__", True) + + +# actually executes the patch 
+patch_old_omegaconf() diff --git a/python/dolma/cli/analyzer.py b/python/dolma/cli/analyzer.py index 124901f8..414615b5 100644 --- a/python/dolma/cli/analyzer.py +++ b/python/dolma/cli/analyzer.py @@ -2,7 +2,7 @@ from typing import List, Optional from dolma.cli import BaseCli, field, print_config -from dolma.cli.shared import WorkDirConfig, make_workdirs, maybe_parse_from_stdin +from dolma.cli.shared import WorkDirConfig, make_workdirs from dolma.core.analyzer import create_and_run_analyzer from dolma.core.errors import DolmaConfigError from dolma.core.loggers import get_logger @@ -57,8 +57,6 @@ class AnalyzerCli(BaseCli): def run(cls, parsed_config: AnalyzerConfig): logger = get_logger("analyzer") - parsed_config.attributes = maybe_parse_from_stdin(parsed_config.attributes) - # perform some path validation to make sure we don't call the mixer with invalid config total_matching_documents = 0 for document in parsed_config.attributes: diff --git a/python/dolma/cli/deduper.py b/python/dolma/cli/deduper.py index 346e33ff..de684200 100644 --- a/python/dolma/cli/deduper.py +++ b/python/dolma/cli/deduper.py @@ -8,12 +8,7 @@ from dolma import deduper from dolma.cli import BaseCli, field, print_config -from dolma.cli.shared import ( - WorkDirConfig, - get_path_to_temp_file, - make_workdirs, - maybe_parse_from_stdin, -) +from dolma.cli.shared import WorkDirConfig, get_path_to_temp_file, make_workdirs from dolma.core.errors import DolmaConfigError from dolma.core.loggers import get_logger from dolma.core.paths import glob_path, is_local @@ -121,8 +116,6 @@ def run(cls, parsed_config: DeduperConfig): dict_config: Dict[str, Any] = {} with ExitStack() as stack: - parsed_config.documents = maybe_parse_from_stdin(parsed_config.documents) - work_dirs = stack.enter_context(make_workdirs(parsed_config.work_dir)) # create a dedupe config to populate diff --git a/python/dolma/cli/resolvers.py b/python/dolma/cli/resolvers.py index a5195a84..0df7a3ec 100644 --- 
a/python/dolma/cli/resolvers.py +++ b/python/dolma/cli/resolvers.py @@ -2,6 +2,7 @@ import sys from typing import Callable, List, Optional, TypeVar +import smart_open from cached_path import cached_path from omegaconf.omegaconf import OmegaConf as om @@ -22,30 +23,40 @@ def _wrapped_add_fn( resolver_name: str = name, ) -> C: base_add_fn(resolver) - resolver_name = f"d.{resolver_name}" om.register_new_resolver(resolver_name, resolver, replace=True) return resolver return _wrapped_add_fn -@ResolverRegistry.add("cache", "Download a file and replace the path with the cached path.") +@ResolverRegistry.add("d.cache", "Download a file and replace the path with the cached path.") def cache(path: str) -> str: return str(cached_path(path)) -@ResolverRegistry.add("glob", "Glob this path and return a list of files.") +@ResolverRegistry.add("d.glob", "Glob this path and return a list of files.") def glob(path: str) -> List[str]: globbed = list(glob_path(path)) assert len(globbed) > 0, f"Path {path} does not match any files" return globbed -@ResolverRegistry.add("processes", "Return the number of processes available (optionally with buffer).") +@ResolverRegistry.add("d.procs", "Return the number of processes available (optionally with buffer).") def processes(n: int = 0) -> int: return max(1, multiprocessing.cpu_count() - n) -@ResolverRegistry.add("stdin", "Read from stdin and return list of paths.") +@ResolverRegistry.add("d.stdin", "Read from stdin and return list of lines.") def stdin() -> List[str]: - return [line.strip() for line in sys.stdin] + return [stripped_line for line in sys.stdin if (stripped_line := line.strip())] + + +@ResolverRegistry.add("d.file", "Read from a file and return contents.") +def file_(path: str) -> str: + with smart_open.open(path, "rt") as f: + return str(f.read()) + + +@ResolverRegistry.add("d.split", "Split string into list of strings on symbol.") +def split(string: str, symbol: str = "\n") -> List[str]: + return [stripped_line for line in 
string.split(symbol) if (stripped_line := line.strip())] diff --git a/python/dolma/cli/tagger.py b/python/dolma/cli/tagger.py index 0359e00c..ff56419e 100644 --- a/python/dolma/cli/tagger.py +++ b/python/dolma/cli/tagger.py @@ -3,7 +3,7 @@ from typing import List, Optional from dolma.cli import BaseCli, field, print_config -from dolma.cli.shared import WorkDirConfig, make_workdirs, maybe_parse_from_stdin +from dolma.cli.shared import WorkDirConfig, make_workdirs from dolma.core.errors import DolmaConfigError from dolma.core.loggers import get_logger from dolma.core.paths import glob_path @@ -100,7 +100,6 @@ def run(cls, parsed_config: TaggerConfig): logger = get_logger("tagger") with make_workdirs(parsed_config.work_dir) as work_dirs: - parsed_config.documents = maybe_parse_from_stdin(parsed_config.documents) documents = [str(p) for p in parsed_config.documents] taggers = [str(p) for p in parsed_config.taggers] diff --git a/python/dolma/cli/tokenizer.py b/python/dolma/cli/tokenizer.py index 61b3a3e5..6ecce941 100644 --- a/python/dolma/cli/tokenizer.py +++ b/python/dolma/cli/tokenizer.py @@ -2,7 +2,7 @@ from typing import List, Optional from dolma.cli import BaseCli, field, print_config -from dolma.cli.shared import WorkDirConfig, make_workdirs, maybe_parse_from_stdin +from dolma.cli.shared import WorkDirConfig, make_workdirs from dolma.core.errors import DolmaConfigError from dolma.core.loggers import get_logger from dolma.core.paths import glob_path @@ -151,7 +151,6 @@ def run(cls, parsed_config: TokenizationConfig): logger = get_logger("tagger") with make_workdirs(parsed_config.work_dir) as work_dirs: - parsed_config.documents = maybe_parse_from_stdin(parsed_config.documents) documents = [str(p) for p in parsed_config.documents] # perform some path validation to make sure we don't call the mixer with invalid config diff --git a/python/dolma/cli/warc.py b/python/dolma/cli/warc.py index d7fbfba7..50ca838e 100644 --- a/python/dolma/cli/warc.py +++ 
b/python/dolma/cli/warc.py @@ -2,7 +2,7 @@ from typing import List from dolma.cli import BaseCli, field, print_config -from dolma.cli.shared import WorkDirConfig, make_workdirs, maybe_parse_from_stdin +from dolma.cli.shared import WorkDirConfig, make_workdirs from dolma.core.errors import DolmaConfigError from dolma.core.loggers import get_logger from dolma.core.paths import glob_path @@ -63,8 +63,9 @@ class WarcExtractorConfig: work_dir: WorkDirConfig = field(default=WorkDirConfig(), help="Configuration for temporary work directories.") dryrun: bool = field( default=False, - help="If true, only print the configuration and exit without running the taggers.", + help="If true, only print the configuration and exit without running the pipieline.", ) + check: bool = field(default=True, help="If true, check if input documents are valid paths before running the") class WarcExtractorCli(BaseCli): @@ -76,8 +77,6 @@ def run(cls, parsed_config: WarcExtractorConfig): logger = get_logger("warc") with make_workdirs(parsed_config.work_dir) as work_dirs: - parsed_config.documents = maybe_parse_from_stdin(parsed_config.documents) - documents = [str(p) for p in parsed_config.documents] destination = [str(p) for p in parsed_config.destination] @@ -85,19 +84,20 @@ def run(cls, parsed_config: WarcExtractorConfig): if not isinstance(source_name, str): raise ValueError(f"source_name must be a string, not {source_name} ({type(source_name)})") - # perform some path validation to make sure we don't call - # the extractor with invalid config - total_matching_documents = 0 - for document in documents: - current_matching_documents = sum(1 for _ in glob_path(document)) - if current_matching_documents == 0: - # only raise a warning if no documents are found for a single path - logger.warning("No documents found for path %s", document) - total_matching_documents += current_matching_documents - - if total_matching_documents == 0: - # but raise an error if no documents are found for all paths - 
raise DolmaConfigError(f"No documents found for paths {documents}.") + if parsed_config.check: + # perform some path validation to make sure we don't call the warc + # extractor with an invalid config + total_matching_documents = 0 + for document in documents: + current_matching_documents = sum(1 for _ in glob_path(document)) + if current_matching_documents == 0: + # only raise a warning if no documents are found for a single path + logger.warning("No documents found for path %s", document) + total_matching_documents += current_matching_documents + + if total_matching_documents == 0: + # but raise an error if no documents are found for all paths + raise DolmaConfigError(f"No documents found for paths {documents}.") print_config(parsed_config) if parsed_config.dryrun: From f6db05ed434d16a284e735b77da875155d44f662 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 10 May 2024 17:37:39 +0000 Subject: [PATCH 006/169] math parsers --- pyproject.toml | 1 + python/dolma/cli/__init__.py | 8 ++++---- python/dolma/core/parallel.py | 26 ++++++++++++++++++++------ python/dolma/taggers/__init__.py | 1 + python/dolma/warc/utils.py | 6 ++++-- 5 files changed, 30 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7be2a77e..5feef1a3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,6 +127,7 @@ warc = [ "fastwarc", "w3lib", "url-normalize", + "dateparser" ] trafilatura = [ diff --git a/python/dolma/cli/__init__.py b/python/dolma/cli/__init__.py index 137f3ef4..5293909c 100644 --- a/python/dolma/cli/__init__.py +++ b/python/dolma/cli/__init__.py @@ -226,11 +226,11 @@ def patch_old_omegaconf(): is_primitive_container, is_structured_config, ) - from omegaconf.errors import ( - InterpolationToMissingValueError, # pylint: disable=import-outside-toplevel + from omegaconf.errors import ( # pylint: disable=import-outside-toplevel + InterpolationToMissingValueError, ) - from omegaconf.nodes import ( - InterpolationResultNode, # pylint: 
disable=import-outside-toplevel + from omegaconf.nodes import ( # pylint: disable=import-outside-toplevel + InterpolationResultNode, ) def _resolve_container_value(cfg: Container, key: Any) -> None: diff --git a/python/dolma/core/parallel.py b/python/dolma/core/parallel.py index 0bbfc75f..6013a378 100644 --- a/python/dolma/core/parallel.py +++ b/python/dolma/core/parallel.py @@ -11,7 +11,7 @@ from functools import partial from queue import Queue from threading import Thread -from typing import Any, Dict, List, NamedTuple, Optional, Tuple, TypeVar, Union +from typing import Any, Dict, List, NamedTuple, Optional, Set, Tuple, TypeVar, Union import smart_open import tqdm @@ -453,6 +453,14 @@ def _valid_path(self, path: str) -> bool: return False return True + def _get_existing_meta(self, *meta_prefixes: str) -> Set[str]: + """Get the existing metadata files for the given prefixes.""" + existing_metadata = set() + for meta_prefix in meta_prefixes: + for path in glob_path(meta_prefix): + existing_metadata.add(re.sub(rf"{METADATA_SUFFIX}$", "", sub_prefix(path, meta_prefix))) + return existing_metadata + def _get_all_paths(self) -> AllPathsTuple: """Get all paths to process using prefixes provided""" all_paths = AllPathsTuple.empty() @@ -478,10 +486,7 @@ def _get_all_paths(self) -> AllPathsTuple: random.shuffle(rel_paths) # get a list of which metadata files already exist - existing_metadata_names = set( - re.sub(rf"{METADATA_SUFFIX}$", "", sub_prefix(path, meta_prefix)) - for path in glob_path(meta_prefix) - ) + existing_metadata_names = self._get_existing_meta(meta_prefix) for path in rel_paths: if not self.ignore_existing and path in existing_metadata_names: @@ -501,14 +506,23 @@ def _get_all_paths(self) -> AllPathsTuple: def __call__(self, **process_single_kwargs: Any): """Run the processor.""" + logger = self.get_logger() + logger.setLevel(logging.INFO) + random.seed(self.seed) # in case the user wants to override the default kwargs for retries 
process_single_kwargs.setdefault("retries_on_error", self.retries_on_error) all_paths = self._get_all_paths() + logger.info("Found %s files to process", len(all_paths.src)) - print(f"Found {len(all_paths.src):,} files to process") + if len(all_paths.src) == 0: + if len(self._get_existing_meta(*self.meta_prefixes)) > 0: + logger.info("All files already processed; skipping.") + return + else: + raise DolmaError("No files found to process.") fn = self._debug_run_all if self.debug else self._multiprocessing_run_all diff --git a/python/dolma/taggers/__init__.py b/python/dolma/taggers/__init__.py index 15ebaba0..ea63a324 100644 --- a/python/dolma/taggers/__init__.py +++ b/python/dolma/taggers/__init__.py @@ -10,6 +10,7 @@ punctuation, repetitions, sampling, + science, tokenizers, url, ) diff --git a/python/dolma/warc/utils.py b/python/dolma/warc/utils.py index f21d2651..8c550cd1 100644 --- a/python/dolma/warc/utils.py +++ b/python/dolma/warc/utils.py @@ -25,8 +25,10 @@ def raise_warc_dependency_error(package: str): class UrlNormalizer: def __init__(self): - assert URL_NORMALIZE_AVAILABLE, raise_warc_dependency_error("url-normalize") - assert W3LIB_AVAILABLE, raise_warc_dependency_error("w3lib") + if not URL_NORMALIZE_AVAILABLE: + raise_warc_dependency_error("url-normalize") + if not W3LIB_AVAILABLE: + raise_warc_dependency_error("w3lib") self.www_subdomain_regex = re.compile(r"(^(www\d*\.))|(/+$)", re.IGNORECASE) def __call__(self, url: str) -> str: From 8f282cf187ffef04df1bae9a256e21169bc03f73 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 10 May 2024 17:37:51 +0000 Subject: [PATCH 007/169] math parsers --- python/dolma/taggers/science.py | 339 ++++++++++++++++++++++++++++++++ 1 file changed, 339 insertions(+) create mode 100644 python/dolma/taggers/science.py diff --git a/python/dolma/taggers/science.py b/python/dolma/taggers/science.py new file mode 100644 index 00000000..1b8f37f3 --- /dev/null +++ b/python/dolma/taggers/science.py @@ -0,0 +1,339 @@ +import 
re +from typing import Optional + +from ..core.data_types import DocResult, DocumentWithMetadata, Span +from ..core.registry import TaggerRegistry +from ..core.taggers import BaseTagger + +MATH_KEYWORDS = [ + "MathJax", + "mathjax", + " DocResult: # type: ignore + html: Optional[str] = doc.metadata.get("html", None) + if html is None: + raise ValueError("Cannot find `html` key in metadata.") + + if match := self.expr.search(html): + start, end = match.span() + spans = [Span(start=start, end=end, type="math", score=end - start)] + else: + spans = [] + + return DocResult(doc=doc, spans=spans) + + +@TaggerRegistry.add("owm_latex_v1") +class OpenWebMathContainsLatexTagger(BaseTagger): + def __init__(self): + self.expr = re.compile("|".join(LATEX_MATH_COMMANDS)) + + def predict(self, doc: DocumentWithMetadata) -> DocResult: # type: ignore + html: Optional[str] = doc.metadata.get("html", None) + if html is None: + raise ValueError("Cannot find `html` key in metadata.") + + if ("\\\\" in html) and (match := self.expr.search(html)): + start, end = match.span() + spans = [Span(start=start, end=end, type="latex", score=end - start)] + else: + spans = [] + + return DocResult(doc=doc, spans=spans) + + +@TaggerRegistry.add("science_kw_v1") +class ScienceKeywordsTagger(BaseTagger): + def __init__(self): + self.expr = re.compile("|".join(SCIENCE_KEYWORDS)) + + def predict(self, doc: DocumentWithMetadata) -> DocResult: # type: ignore + text: Optional[str] = doc.metadata.get("html", None) + if text is None: + raise ValueError("Cannot find `html` key in metadata.") + + if match := self.expr.search(text): + start, end = match.span() + spans = [Span(start=start, end=end, type="science", score=end - start)] + else: + spans = [] + + return DocResult(doc=doc, spans=spans) From acf4ce55766692bb89a0220da28b56dd8ec317fd Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 10 May 2024 19:20:52 +0000 Subject: [PATCH 008/169] science --- configs/crawl/cccc.yaml | 16 + 
configs/crawl/science.yaml | 23 + pyproject.toml | 11 +- python/dolma/taggers/language.py | 11 + python/dolma/taggers/science.py | 527 ++++---- python/dolma/warc/linearizers.py | 11 + python/dolma/warc/openwebmath/__init__.py | 27 + python/dolma/warc/openwebmath/config.py | 41 + python/dolma/warc/openwebmath/constants.py | 283 +++++ python/dolma/warc/openwebmath/extract.py | 155 +++ .../warc/openwebmath/latex_processing.py | 778 ++++++++++++ .../dolma/warc/openwebmath/line_processing.py | 82 ++ python/dolma/warc/openwebmath/mmltex/README | 97 ++ .../dolma/warc/openwebmath/mmltex/cmarkup.xsl | 1093 +++++++++++++++++ .../warc/openwebmath/mmltex/entities.xsl | 316 +++++ .../dolma/warc/openwebmath/mmltex/glayout.xsl | 220 ++++ .../dolma/warc/openwebmath/mmltex/mmltex.xsl | 45 + .../dolma/warc/openwebmath/mmltex/scripts.xsl | 292 +++++ .../dolma/warc/openwebmath/mmltex/tables.xsl | 130 ++ .../dolma/warc/openwebmath/mmltex/tokens.xsl | 296 +++++ .../dolma/warc/openwebmath/tree_processing.py | 400 ++++++ python/dolma/warc/openwebmath/utils.py | 105 ++ 22 files changed, 4640 insertions(+), 319 deletions(-) create mode 100644 configs/crawl/cccc.yaml create mode 100644 configs/crawl/science.yaml create mode 100644 python/dolma/warc/openwebmath/__init__.py create mode 100644 python/dolma/warc/openwebmath/config.py create mode 100644 python/dolma/warc/openwebmath/constants.py create mode 100644 python/dolma/warc/openwebmath/extract.py create mode 100644 python/dolma/warc/openwebmath/latex_processing.py create mode 100644 python/dolma/warc/openwebmath/line_processing.py create mode 100644 python/dolma/warc/openwebmath/mmltex/README create mode 100644 python/dolma/warc/openwebmath/mmltex/cmarkup.xsl create mode 100644 python/dolma/warc/openwebmath/mmltex/entities.xsl create mode 100644 python/dolma/warc/openwebmath/mmltex/glayout.xsl create mode 100644 python/dolma/warc/openwebmath/mmltex/mmltex.xsl create mode 100644 python/dolma/warc/openwebmath/mmltex/scripts.xsl create mode 
100644 python/dolma/warc/openwebmath/mmltex/tables.xsl create mode 100644 python/dolma/warc/openwebmath/mmltex/tokens.xsl create mode 100644 python/dolma/warc/openwebmath/tree_processing.py create mode 100644 python/dolma/warc/openwebmath/utils.py diff --git a/configs/crawl/cccc.yaml b/configs/crawl/cccc.yaml new file mode 100644 index 00000000..586c5b37 --- /dev/null +++ b/configs/crawl/cccc.yaml @@ -0,0 +1,16 @@ +documents: ${d.stdin:} +destination: + - ${oc.env:HOME}/cccc/CC-MAIN-2024-18/documents +processes: ${d.procs:} +source_name: cccc_CC-MAIN-2024-18 +linearizer: resiliparse +pre: + taggers: + - cc_re + skip: true + +store_html_in_metadata: true + +work_dir: + input: /tmp/cccc/CC-MAIN-2024-18/input + output: /tmp/cccc/CC-MAIN-2024-18/output diff --git a/configs/crawl/science.yaml b/configs/crawl/science.yaml new file mode 100644 index 00000000..bec9f948 --- /dev/null +++ b/configs/crawl/science.yaml @@ -0,0 +1,23 @@ +documents: ${d.stdin:} +destination: + - ${oc.env:HOME}/science/CC-MAIN-2024-18/documents +processes: ${d.procs:} +source_name: cccc_CC-MAIN-2024-18 +linearizer: openwebmath +pre: + taggers: + - owm_math_v1 + - owm_latex_v1 + - science_kw_v1 + skip: true + +post: + taggers: + - ft_dolma_doc_eng + skip: true + +store_html_in_metadata: false + +work_dir: + input: /tmp/science/CC-MAIN-2024-18/input + output: /tmp/science/CC-MAIN-2024-18/output diff --git a/pyproject.toml b/pyproject.toml index 5feef1a3..e85995e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,6 +18,7 @@ dependencies = [ "omegaconf>=2.3.0", # "pycld2==0.41", # "pycld3==0.22", # does not install correctly + "acora>=2.4", "platformdirs>=4.2.0", "pyyaml", "requests", @@ -148,6 +149,14 @@ resiliparse = [ "resiliparse", ] +openwebmath = [ + "dolma[warc]", + "resiliparse", + "tabulate", + "py_asciimath", + "lxml" +] + # all extensions all = [ "dolma[dev]", @@ -176,7 +185,7 @@ features = ["pyo3/extension-module"] where = ["src"] [tool.setuptools.package-data] -dolma = ["py.typed", 
"data/*"] +dolma = ["py.typed", "data/*", "warc/openwebmath/mmltex/*"] [tool.black] line-length = 115 diff --git a/python/dolma/taggers/language.py b/python/dolma/taggers/language.py index 66cb4c6d..2dab5390 100644 --- a/python/dolma/taggers/language.py +++ b/python/dolma/taggers/language.py @@ -182,6 +182,17 @@ def predict_text(self, text: str) -> List[Tuple[str, float]]: return filtered_preds # pyright: ignore +@TaggerRegistry.add("ft_dolma_doc_eng") +class FastTextEnglishDolmaTagger(FastTextEnglishLanguageDocumentTagger): + INCLUDE_NEGATIVE = True + PREDICT_ON_PARAGRAPHS = False + + def predict_text(self, text: str) -> List[Tuple[str, float]]: + preds = super().predict_text(text) + filtered_preds = [(lang, score) for lang, score in preds if lang == "en" and score > 0.5] + return filtered_preds # pyright: ignore + + @TaggerRegistry.add("ft_lang_id_en_only_v2") class FastTextEnglishOnlyLanguageDocumentTagger(FastTextEnglishLanguageDocumentTagger): INCLUDE_NEGATIVE = False diff --git a/python/dolma/taggers/science.py b/python/dolma/taggers/science.py index 1b8f37f3..c8dd075b 100644 --- a/python/dolma/taggers/science.py +++ b/python/dolma/taggers/science.py @@ -1,339 +1,230 @@ -import re -from typing import Optional +from typing import List, Optional + +from acora import AcoraBuilder from ..core.data_types import DocResult, DocumentWithMetadata, Span from ..core.registry import TaggerRegistry -from ..core.taggers import BaseTagger - -MATH_KEYWORDS = [ - "MathJax", - "mathjax", - " DocResult: # type: ignore + def _get_content(self, doc: DocumentWithMetadata) -> str: html: Optional[str] = doc.metadata.get("html", None) if html is None: raise ValueError("Cannot find `html` key in metadata.") + return html - if match := self.expr.search(html): - start, end = match.span() - spans = [Span(start=start, end=end, type="math", score=end - start)] - else: - spans = [] + def predict(self, doc: DocumentWithMetadata) -> DocResult: # type: ignore + content = self._get_content(doc) 
+ spans = [ + Span( + start=(start := match[1]), + end=(end := match[1] + len(match[0])), + type=self.TYPE, + score=(end - start), + ) + for match in self.acora.finditer(content) + ] return DocResult(doc=doc, spans=spans) -@TaggerRegistry.add("owm_latex_v1") -class OpenWebMathContainsLatexTagger(BaseTagger): - def __init__(self): - self.expr = re.compile("|".join(LATEX_MATH_COMMANDS)) - - def predict(self, doc: DocumentWithMetadata) -> DocResult: # type: ignore - html: Optional[str] = doc.metadata.get("html", None) - if html is None: - raise ValueError("Cannot find `html` key in metadata.") +@TaggerRegistry.add("owm_math_v1") +class OpenWebMathContainsMathTagger(BaseHTMLKeywordLookupTagger): + TYPE = "math" + KEYWORDS = [ + "MathJax", + "mathjax", + " DocResult: # type: ignore - text: Optional[str] = doc.metadata.get("html", None) - if text is None: - raise ValueError("Cannot find `html` key in metadata.") - - if match := self.expr.search(text): - start, end = match.span() - spans = [Span(start=start, end=end, type="science", score=end - start)] - else: - spans = [] - - return DocResult(doc=doc, spans=spans) +class ScienceKeywordsTagger(BaseHTMLKeywordLookupTagger): + TYPE = "science" + KEYWORDS = [ + "bmatrix", + "theorem", + "orbitals", + "equations", + "electrons", + "equation", + "hypothesis", + "equilibrium", + "probability", + "deviation", + "atoms", + "molecules", + "theory", + "acceleration", + "molecule", + "hydrogen", + "molecular", + "thesis", + "proportion", + "simplify", + "velocity", + "momentum", + "concentration", + "compounds", + "voltage", + "magnetic", + "definition", + "compound", + "particle", + "vector", + "population", + "determine", + "forces", + "acids", + "study", + "exercises", + "circuit", + "bonds", + "variables", + "temperature", + "oxygen", + "exercise", + "physical", + "angular", + "frequency", + "chemical", + "appendix", + "pressure", + "atmosphere", + "reaction", + "sediment", + "distance", + "waves", + "surface", + "reactions", + 
"computer", + "learning", + "located", + "electron", + "levels", + "wave", + "carbon", + "earthquake", + "bond", + "protein", + "earth", + "soil", + "income", + "disease", + "tissue", + "blood", + "patient", + "climate", + "muscle", + "financial", + "acid", + "minerals", + "rocks", + ] diff --git a/python/dolma/warc/linearizers.py b/python/dolma/warc/linearizers.py index a99c0775..7e73384c 100644 --- a/python/dolma/warc/linearizers.py +++ b/python/dolma/warc/linearizers.py @@ -5,6 +5,7 @@ from necessary import necessary from ..core.registry import BaseRegistry +from .openwebmath import Extractor from .utils import raise_warc_dependency_error with necessary("trafilatura", soft=True) as TRAFILATURA_AVAILABLE: @@ -143,3 +144,13 @@ def linearize(self, content: Union[str, bytes]) -> str: ) self._flush() return output or "" + + +@LinearizerRegistry.add("openwebmath") +class OpenWebMathExtractor(BaseLinearizer): + def __init__(self) -> None: + self.extractor = Extractor() + + def linearize(self, content: Union[str, bytes]) -> str: + output = self.extractor.extract_text(str(content)) + return output or "" diff --git a/python/dolma/warc/openwebmath/__init__.py b/python/dolma/warc/openwebmath/__init__.py new file mode 100644 index 00000000..68a5cf7e --- /dev/null +++ b/python/dolma/warc/openwebmath/__init__.py @@ -0,0 +1,27 @@ +from functools import partial +from typing import Optional + +from necessary import necessary +from omegaconf import OmegaConf as om + +from .config import OpenWebMathConfig + +__all__ = ["Extractor", "OpenWebMathConfig"] + + +class Extractor: + def __init__(self, config: Optional[OpenWebMathConfig] = None): + necessary("tabulate", message="{module_name} not available; please install dolma[openwebmath]") + necessary("py_asciimath", message="{module_name} not available; please install dolma[openwebmath]") + necessary("lxml", message="{module_name} not available; please install dolma[openwebmath]") + from .extract import extract_text + + parsed_config = 
om.to_object(config or om.structured(OpenWebMathConfig)) + self._extract_fn = partial(extract_text, config=parsed_config, fast=False) + + def extract_text(self, html: str) -> str: + out = self._extract_fn(html) + if isinstance(out, tuple): + return str(out[0]) + + return "" diff --git a/python/dolma/warc/openwebmath/config.py b/python/dolma/warc/openwebmath/config.py new file mode 100644 index 00000000..97a8712c --- /dev/null +++ b/python/dolma/warc/openwebmath/config.py @@ -0,0 +1,41 @@ +from dataclasses import dataclass, field +from typing import List, NamedTuple + + +class ManagerTuple(NamedTuple): + score: float + activate: bool + + +class ThresholdTuple(NamedTuple): + lo: float + hi: float + + +@dataclass +class BoilerPlateConfig: + ratio_threshold: List[ThresholdTuple] = [ThresholdTuple(0.9, 0.18), ThresholdTuple(0.1, 0.30)] + absolute_threshold: List[ThresholdTuple] = [ThresholdTuple(0.9, 10), ThresholdTuple(0.1, 20)] + end_threshold: List[ThresholdTuple] = [ThresholdTuple(0.95, 15), ThresholdTuple(0.05, 5)] + enable: List[ManagerTuple] = [ManagerTuple(0.95, True), ManagerTuple(0.05, False)] + + +@dataclass +class TableConfig: + min_rows: int = 2 + min_cols: int = 3 + format: str = "plain" + + +@dataclass +class OpenWebMathConfig: + markdown_headings: List[ManagerTuple] = [ManagerTuple(0.9, True), ManagerTuple(0.1, False)] + markdown_code: List[ManagerTuple] = [ManagerTuple(0.95, True), ManagerTuple(0.05, False)] + boilerplate_config: BoilerPlateConfig = field(default_factory=BoilerPlateConfig) + remove_buttons: bool = True + remove_image_figures: bool = True + remove_link_clusters: bool = True + table_config: TableConfig = field(default_factory=TableConfig) + remove_chinese: bool = True + remove_edit_buttons: bool = True + extract_latex: bool = True diff --git a/python/dolma/warc/openwebmath/constants.py b/python/dolma/warc/openwebmath/constants.py new file mode 100644 index 00000000..b58fb8d6 --- /dev/null +++ b/python/dolma/warc/openwebmath/constants.py 
@@ -0,0 +1,283 @@ +BANNED_SELECTORS = [ + ".breadcrumb", + "#popup", + "#flyout", + "#site-slogan", + "#site-name", + "#menu", + ".nav", + ".login", + ".dropdown", + ".dropdown-menu", + "#login", + ".vote", + ".form-item", + ".user_pic_popup", + "#post-editor", + ".post-form", + ".bottom-notice", + "#sidebar", + "#copyright", + "#footer", + ".footer", + ".site-navigation", + ".popupgroup", + ".posthead", + ".signaturecontainer", + ".after_content", + ".userinfo", + "#similar_threads", + ".toplinks", + ".user-info", + ".post-header", + ".widget_archive", + ".widget_categories", + ".widget_meta", + ".widget_recent_entries", + ".widget_rss", + ".wp_widget_tag_cloud", + ".widget_calendar", + ".navbox", + "#mw-hidden-catlinks", + ".above_postlist", + "#navigation", + ".threadtools", + ".socialbuttons", + "#respond", + ".menu", + ".WikiaHeader", + ".buttons", + "#WikiaRecentActivity", + "#WikiaRandomWiki", + ".loggedout-follow-normal", + "#blurb", + "#banner-top", + ".topbar", + ".topbar-dialog", + ".related-links", + ".votecell", + ".comment-actions", + ".d-none", + ".Tooltip", + ".Notices", + ".likes-other-gravatars", + "#logo_and_banner", + "#pmmcrumb2", + ".qa-notice", + ".qa-nav-user", + ".trackbacks", + "#further_reading", + ".topbar-links", + "#your-communities-section", + ".links-container", + "#herobox", + ".qa-voting-container", + ".qa-post-when-container", + ".qa-q-view-who", + ".qa-q-item-meta", + ".post-menu", + "#vbseo-likes", + "#side_one", + "#side_two", + "#feed_bar", + ".author", + "#likes-other-gravatars", + ".pageInfo", + ".ka-video-player", + ".mw-editsection", + ".mw-ui-icon", + "#mw-revision-info", + "#siteSub", + ".heading--main", + "#loginBarHandle", + ".medalsrest", + ".diff-otitle", + ".diff-ntitle", + ".diff-currentversion-title", + ".diff-contentalign-left", + '[class*="promo"]', + '[class*="button"]', + '[class*="upsell"]', + ".expert-reply-overlay", + ".PreviewContents", + ".solutionHeader__isbn", + ".cta", + ".update-header", + 
".best-answer-selected", + ".medal-info", + "#profile-tooltip", + ".update-info", + ".google-search-openstudy", + ".attachments", + "button", + ".delete", + ".editor-actions", + ".editor", + ".files-attached", + ".call-to-action", + ".group-info", + ".top-online-users", + ".message-userExtras", + ".message-attribution-opposite", + ".u-srOnly", + ".block--similarContents", + ".u-concealed", + ".similarThreads", + ".breadcrumbs", + ".courseHeader", + ".impactSection", + ".creativeCommons", + "#clear", +] + +BOILERPLATE_WORDS = [ + "©", + "updates", + "join our", + "buy", + "sign up", + "no results", + "search images", + "all rights reserved", + "was this", + "please", + "visit our", + "download for free", + "retrieved from", + "home page", + "jump to", + "notification switch", + "your email address", + "view answer", + "no label found", + "is licensed under", + "regular updates", + "copyright", + "have access to this article", + "youtube", + "advertisment", + "password", + "login", + "learn more", + "cookie", + "jump to navigation", + "download", + "table of contents", + "leave a reply", + "leave a message", + "skip to", + "stay updated", + "contact us", + "twitter", + "from wikibooks, open books for an open world", + "last modified", + "from wikipedia, the free encyclopedia", + "more info", + "terms of use", + "terms of service", + "privacy policy", + "navigation", + "sign in", + "report error", + "newest", + "under license", + "follow ", + "newer", + "notification", + "post a comment", + "click here", + "leave a comment", + "google", + "free account", + "for free", + "alert", + "receive update", + "share this", + "report ad", + "more posts", + "date of creation", + "link", + "powered by", + "receive", + "newsletter", + "pdf version", + "ask", + "your ", + "facebook", + "jump to search", + "required fields", + "back to top", + "published by", + "pdf article", + "accessload", + "start with", + "loading", + "username", + "helpful", + "log in", + "license", + "get the 
best", + "join us", + "full article", + "attribution", + "main content", + "printed from", + "distributed under", + "rss", + "24/7", + "your service", + "please contact", + "captcha", + "might be incomplete", + "about this", + "lifetime", + "access to", + "this article is", + "not found", + "show more", + "about", + "business", + "interested in joining", + "wikipedia page", + "gift", + "premium", + "purchase this", + "purchasing", + "access denied", + "wims", + "latest version", + "this page", + "your web browser", + "recent version", + "this article", + "please help", + "help you", + "discard", + "view tag cloud", + "reply", + "sponsor", + "return to", + "physicsoverflow is an open platform for community peer review", + "comments", + "trackback", + "show menu", + "add comment", + "printable view", + "advertisement", + "join now", + "from proofwiki", + "energy points", + "at the top of this image page", + "all products", + "maplesim", + "online help", + "see also", + "all lesson plans", + "menu", + "check out more articles", + "ad", + "votes", + "answer", + "question you clicked on", + "this question is closed", +] diff --git a/python/dolma/warc/openwebmath/extract.py b/python/dolma/warc/openwebmath/extract.py new file mode 100644 index 00000000..3e2e1c2d --- /dev/null +++ b/python/dolma/warc/openwebmath/extract.py @@ -0,0 +1,155 @@ +import re + +from resiliparse.extract.html2text import extract_plain_text +from resiliparse.parse.html import HTMLTree + +from .constants import BANNED_SELECTORS +from .latex_processing import ( + extract_delimited_math, + extract_math, + get_math_config, + replace_math_tags_with_dollar_signs, +) +from .line_processing import ( + remove_boilerplate, + remove_chinese_characters, + remove_edit_buttons, + remove_empty_headers, +) +from .tree_processing import ( + add_se_separators, + extract_code, + extract_headings, + extract_tables, + main_content_preprocess, + post_process_headings, + remove_buttons, + remove_dense_links, + 
remove_display_none, + remove_image_figures, + wikipedia_preprocess, +) +from .utils import ReplacementManager + + +def filter_tree(tree, replacement_manager, config): + """Filters the HTML tree to remove unwanted elements.""" + + # Remove display none elements + remove_display_none(tree) + + # Remove the wikipedia footer + wikipedia_preprocess(tree) + + if config["remove_buttons"]: + # Remove any bootstrap buttons + remove_buttons(tree) + + if config["remove_image_figures"]: + # Remove any figures that only contain images + remove_image_figures(tree) + + if config["markdown_code"]: + # Wrap the code in markdown code blocks + extract_code(tree, replacement_manager) + + # Record the location of headings and format them + extract_headings(tree, replacement_manager, config["markdown_headings"]) + + # Remove link lists + remove_dense_links(tree) + + # Format tables + extract_tables(tree.document, replacement_manager, config["table_config"]) + + # Process stack exchange separators + add_se_separators(tree) + + # Preprocess main content + main_content_preprocess(tree) + + return tree + + +def html_preprocessing(html): + html = html.replace("<math>", "[itex]") + html = html.replace("</math>", "[/itex]") + return html + + +def replace_tags(html, old, new): + pattern = re.compile(old, re.IGNORECASE) + return pattern.sub(new, html) + + +def extract_text(html, config, fast=False): + """Extracts plain text from an HTML string.""" + html = replace_tags(html, "", "") + html = html_preprocessing(html) + tree = HTMLTree.parse(html) + replacement_manager = ReplacementManager() + + if fast: + links = tree.document.query_selector_all("a") + span_links = tree.document.query_selector_all("span a") + if len(links) > 3000 or len(span_links) > 3000: + print("Too many links, skipping") + return None + + if config["extract_latex"]: + math_config = get_math_config(tree.document.html) + tree, info = extract_math(tree, replacement_manager) + else: + info = {} + tree = filter_tree(tree, 
replacement_manager, config) + + # Disable their filters because we use our own. + text = extract_plain_text(tree, main_content=True, alt_texts=False, skip_elements=BANNED_SELECTORS) + + if config["extract_latex"]: + text = extract_delimited_math(text, math_config, info, replacement_manager) + + text = post_process_headings(text) + + lines = text.split("\n") + + if config["remove_chinese"]: + # Remove Chinese characters + lines = remove_chinese_characters(lines) + + if config["boilerplate_config"]["enable"]: + # Remove boilerplate + lines = remove_boilerplate(lines, config["boilerplate_config"], replacement_manager) + + # Remove headings with nothing (or only other headings) after + lines = remove_empty_headers(lines, replacement_manager) + + # Strip lines + lines = [line.strip() for line in lines] + + # Create the final string + text = "\n".join(lines) + + # Escape any dollar signs in the text + text = text.replace("$", "\\$") + + # Now, add the dollar signs for math + text = replace_math_tags_with_dollar_signs(text) + + if config["remove_edit_buttons"]: + # Remove edit buttons + lines = text.split("\n") + lines = remove_edit_buttons(lines) + text = "\n".join(lines) + + # If there are over two newlines in a row, replace with two + text = re.sub(r"\n{3,}", "\n\n", text) + + text = replacement_manager.remove_tags(text) + + text = text.strip() + + return text, info diff --git a/python/dolma/warc/openwebmath/latex_processing.py b/python/dolma/warc/openwebmath/latex_processing.py new file mode 100644 index 00000000..afd048ce --- /dev/null +++ b/python/dolma/warc/openwebmath/latex_processing.py @@ -0,0 +1,778 @@ +import html +import json +import logging +import os +import re +from urllib.parse import unquote + +from lxml import etree as ET +from py_asciimath.translator.translator import ASCIIMath2Tex +from resiliparse.parse.html import traverse_dom + +logging.getLogger().setLevel(logging.ERROR) + +color_regex = re.compile(r"\\textcolor\[.*?\]\{.*?\}") + +asciimath2tex = 
ASCIIMath2Tex(log=False) + +PARAGRAPH_TAGS = frozenset( + { + "body", + "blockquote", + "caption", + "center", + "col", + "colgroup", + "dd", + "div", + "dl", + "dt", + "fieldset", + "form", + "legend", + "optgroup", + "option", + "p", + "pre", + "table", + "td", + "textarea", + "tfoot", + "th", + "thead", + "tr", + "ul", + "li", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + } +) + +latex_math_commands = [ + "\\end", + "\\begin", + "\\ref", + "\\frac", + "\\label", + "\\bf", + "\\right", + "\\left", + "\\rm", + "\\alpha", + "\\mu", + "\\def", + "\\it", + "\\pi", + "\\sigma", + "\\sum", + "\\lambda", + "\\beta", + "\\nu", + "\\partial", + "\\int", + "\\delta", + "\\rho", + "\\phi", + "\\gamma", + "\\omega", + "\\over", + "\\nonumber", + "\\bar", + "\\sqrt", + "\\theta", + "\\tau", + "\\em", + "\\rangle", + "\\hat", + "\\tilde", + "\\cal", + "\\hline", + "\\item", + "\\psi", + "\\vec", + "\\langle", + "\\epsilon", + "\\eta", + "\\cdot", + "\\in", + "\\xi", + "\\infty", + "\\quad", + "\\mathcal", + "\\times", + "\\emph", + "\\mathbf", + "\\prime", + "\\be", + "\\mathrm", + "\\ee", + "\\vspace", + "\\pm", + "\\chi", + "\\ell", + "\\text", + "\\qquad", + "\\noindent", + "\\to", + "\\varphi", + "\\hspace", + "\\leq", + "\\cos", + "\\eqref", + "\\overline", + "\\sin", + "\\kappa", + "\\hbox", + "\\rightarrow", + "\\varepsilon", + "\\textit", + "\\dagger", + "\\big", + "\\otimes", + "\\equiv", + "\\zeta", + "\\dot", + "\\ln", +] + +latex_image_class_names = [ + "latexcenter", + "latex", + "tex", + "latexdisplay", + "latexblock", + "latexblockcenter", +] + + +latex_math_commands = [re.escape(term) for term in latex_math_commands] +latex_math_commands = [x + "(?![a-zA-Z])" for x in latex_math_commands] +latex_regex = re.compile("|".join(latex_math_commands)) + + +def extract_asciimath(s): + parsed = asciimath2tex.translate(s) + return parsed + + +cur_file = os.path.abspath(__file__) +xsl_path = os.path.join(os.path.dirname(cur_file), "mmltex/mmltex.xsl") + +xslt = 
ET.parse(xsl_path) +transform = ET.XSLT(xslt) + + +def mml_to_latex(mml_code): + # Remove any attibutes from the math tag + mml_code = re.sub(r"()", r"\1", mml_code) + mml_ns = mml_code.replace("", '') # Required. + mml_dom = ET.fromstring(mml_ns) + mmldom = transform(mml_dom) + latex_code = str(mmldom) + return latex_code + + +def wrap_math(s, display=False): + s = re.sub(r"\s+", " ", s) + s = color_regex.sub("", s) + s = s.replace("$", "") + s = s.replace("\n", " ") + s = s.strip() + if len(s) == 0: + return s + # Don't wrap if it's already in \align + if "align" in s: + return s + if display: + return "[extract_tex]" + s + "[/extract_tex]" + return "[extract_itex]" + s + "[/extract_itex]" + + +def get_math_config(html): + has_mathjax = re.search(r"mathjax", html.lower()) + has_katex = re.search(r"katex", html.lower()) + has_latex_math_command = latex_regex.search(html) + if not has_mathjax and not has_katex and not has_latex_math_command: + return None + # Get LaTeX config for MathJax + regex = r"tex2jax: {[^}]*}" + latex_config = { + "inlineMath": [ + ["$", "$"], + ["\[", "\]"], + ["[itex]", "[/itex]"], + ["[math]", "[/math]"], + ["[latex]", "[/latex]"], + ["[texx]", "[/texx]"], + ], + "displayMath": [["\(", "\)"], ["$$", "$$"], ["[tex]", "[/tex]"]], + "skipTags": ["script", "noscript", "style", "textarea", "pre", "code"], + "ignoreClass": "tex2jax_ignore", + } + try: + match = re.search(regex, html) + if match: + config = match.group(0) + # Make it a valid json object by adding quotes around the keys + config = re.sub(r"(\w+):", r'"\1":', config) + config = "{" + config + "}" + # config = re.sub(r"\\", r"\\\\", config) + config = re.sub(r"'", r'"', config) + config = re.sub(r",\s*}", "}", config) + extracted_latex_config = json.loads(config)["tex2jax"] + # latex_config.update(extracted_latex_config) + # Update this in a smart way: if the key is already there, append the values + # if the key is not there, add it + + for key in extracted_latex_config: + if key 
in latex_config and key != "ignoreClass": + latex_config[key] += extracted_latex_config[key] + else: + latex_config[key] = extracted_latex_config[key] + except Exception as e: + pass + + # Get LaTeX config for KaTeX + """ delimiters: [ + {left: '$$', right: '$$', display: true} + ], + """ + regex = r"delimiters: \[[^\]]*\]" + try: + match = re.search(regex, html) + if match: + config = match.group(0) + # Make it a valid json object by adding quotes around the keys + config = re.sub(r"(\w+):", r'"\1":', config) + # The match is a list without the [] around it. Wrap with {"delimiters": ...} + config = "{" + config + "}" + config = re.sub(r"'", r'"', config) + config = re.sub(r",\s*}", "}", config) + extracted_latex_config = json.loads(config)["delimiters"] + for delimiter in extracted_latex_config: + if delimiter["display"]: + latex_config["displayMath"].append([delimiter["left"], delimiter["right"]]) + else: + latex_config["inlineMath"].append([delimiter["left"], delimiter["right"]]) + except Exception as e: + pass + + # Get AsciiMath config + regex = r"asciimath2jax: {[^}]*}" + asciimath_config = { + "delimiters": [["`", "`"]], + "skipTags": ["script", "noscript", "style", "textarea", "pre", "code"], + "ignoreClass": "asciimath2jax_ignore", + } + try: + match = re.search(regex, html) + if match: + config = match.group(0) + # Make it a valid json object by adding quotes around the keys + config = re.sub(r"(\w+):", r'"\1":', config) + config = "{" + config + "}" + # config = re.sub(r"\\", r"\\\\", config) + config = re.sub(r"'", r'"', config) + config = re.sub(r",\s*}", "}", config) + extracted_asciimath_config = json.loads(config)["asciimath2jax"] + asciimath_config.update(extracted_asciimath_config) + except Exception as e: + pass + return {"latex": latex_config, "asciimath": asciimath_config} + + +def html_unescape(s): + return html.unescape(s) + + +def replace_math_tags_with_dollar_signs(text): + # Replace each of these in the proper way + # itex -> $...$ + # tex 
-> $$...$$ + # asciimath -> ... + + # Instead of this, simply replace extract_itex with $ and extract_tex with $$. + text = re.sub(r"\[extract_itex\]", "$", text) + text = re.sub(r"\[/extract_itex\]", "$", text) + text = re.sub(r"\[extract_tex\]", "$$", text) + text = re.sub(r"\[/extract_tex\]", "$$", text) + + return text + + +def update_text_with_delimiters(text, delimiters, replacement_manager, info): + + def replace_itex(match): + wrapped = wrap_math(match.group(1)) + tagged = replacement_manager.add_replacement(wrapped, tag="math") + return tagged + + def replace_tex(match): + wrapped = wrap_math(match.group(1), display=True) + tagged = replacement_manager.add_replacement(wrapped, tag="math") + return tagged + + def replace_asciimath(match): + wrapped = match.group(1) + tagged = replacement_manager.add_replacement(wrapped, tag="math") + return tagged + + for delimiter, type in delimiters: + start_delimiter = re.escape(delimiter[0]) + end_delimiter = re.escape(delimiter[1]) + regex = f"{start_delimiter}(.*?){end_delimiter}" + if type == "INLINE_LATEX": + # Simply replace the delimiters with [itex] and [/itex] + updated_text = re.sub(regex, replace_itex, text, flags=re.DOTALL) + if updated_text != text: + info["found_math"] = True + info["mathjax_inline_tex"] += 1 + text = updated_text + elif type == "DISPLAY_LATEX": + updated_text = re.sub(regex, replace_tex, text, flags=re.DOTALL) + if updated_text != text: + info["found_math"] = True + info["mathjax_display_tex"] += 1 + text = updated_text + elif type == "ASCIIMATH": + updated_text = re.sub(regex, replace_asciimath, text, flags=re.DOTALL) + if updated_text != text: + info["found_math"] = True + info["mathjax_asciimath"] += 1 + text = updated_text + + return text + + +def extract_delimited_math(text, mathjax_config, info, replacement_manager): + """This operates on plain text and extracts LaTeX and AsciiMath""" + # import pdb; pdb.set_trace() + if mathjax_config is None: + return text + delimiters = [] + for 
delimiter in mathjax_config["latex"]["inlineMath"]: + delimiters.append((delimiter, "INLINE_LATEX")) + for delimiter in mathjax_config["latex"]["displayMath"]: + delimiters.append((delimiter, "DISPLAY_LATEX")) + for delimiter in mathjax_config["asciimath"]["delimiters"]: + delimiters.append((delimiter, "ASCIIMATH")) + + delimiters = sorted(delimiters, key=lambda x: len(x[0][0]), reverse=True) + text = update_text_with_delimiters(text, delimiters, replacement_manager, info) + return text + + +def extract_math(tree, replacement_manager): + """Webpages often contain LaTeX or AsciiMath equations that are + hidden within the HTML. This function extracts the LaTeX and + AsciiMath equations from the HTML. + """ + + info = { + "found_math": False, + "script_math_tex": 0, + "script_math_asciimath": 0, + "math_annotations": 0, + "math_alttext": 0, + "mathml": 0, + "mathjax_tag": 0, + "mathjax_inline_tex": 0, + "mathjax_display_tex": 0, + "mathjax_asciimath": 0, + "img_math": 0, + "codecogs_latex": 0, + "wp_latex": 0, + "mimetex.cgi": 0, + "/images/math/codecogs": 0, + "mathtex.cgi": 0, + "katex": 0, + "math-container": 0, + "wp-katex-eq": 0, + "align": 0, + "equation": 0, + "x-ck12": 0, + "texerror": 0, + } + + # Find and tag any \align environments + def start_callback(element): + regex = r"\\begin{align}(.*?)\\end{align}" + if element.node.type == 3: + text = element.node.text + matches = re.findall(regex, text, re.DOTALL) + for match in matches: + info["align"] += 1 + info["found_math"] = True + match = replacement_manager.add_replacement(match, tag="math") + text.replace(match, match) + element.node.text = text + + def end_callback(element): + pass + + body = tree.document.query_selector("body") + traverse_dom(body, start_callback, end_callback) + + # Find any \equation environments + def start_callback(element): + regex = r"\\begin{equation}(.*?)\\end{equation}" + if element.node.type == 3: + text = element.node.text + matches = re.findall(regex, text, re.DOTALL) + for 
match in matches: + info["equation"] += 1 + info["found_math"] = True + match = match.replace("\\begin{equation}", "") + match = match.replace("\\end{equation}", "") + wrapped_text = wrap_math(match, display=True) + wrapped_text = replacement_manager.add_replacement(wrapped_text, tag="math") + text = text.replace(match, wrapped_text) + # Remove the \begin{equation} and \end{equation} tags + text = text.replace("\\begin{equation}", "") + text = text.replace("\\end{equation}", "") + element.node.text = text + + def end_callback(element): + pass + + body = tree.document.query_selector("body") + traverse_dom(body, start_callback, end_callback) + + # Find all .texerror + texerrors = tree.document.query_selector_all(".texerror") + for texerror in texerrors: + + # Find the text between {} (maximum length) and replace the texerror with that text + match = re.search(r"\{(.{1,})\}", texerror.text) + if match: + info["found_math"] = True + info["texerror"] += 1 + wrapped_match = wrap_math(match.group(1)) + texerror.html = replacement_manager.add_replacement(wrapped_match, tag="math") + + # This has a ton of repeated code, but it's nice to have fine control over + # how each source is handled. + imgs = tree.document.query_selector_all("img") + for img in imgs: + + class_attr = img.getattr("class") + if class_attr is not None: + class_list = class_attr.split(" ") + if any([img_class in class_list for img_class in latex_image_class_names]): + alt = img.getattr("alt") + if alt is None: + continue + new_span = tree.create_element("span") + wrapped_alt = wrap_math(alt) + new_span.html = replacement_manager.add_replacement(wrapped_alt, tag="math") + parent = img.parent + parent.replace_child(new_span, img) + if len(wrapped_alt.strip()) > 0: + info["found_math"] = True + info["img_math"] += 1 + + src = img.getattr("src") + if src is None: + continue + if "codecogs.com" in src: + try: + latex = src.split("?")[1:] + latex = "?".join(latex) # In case there are multiple ? 
in the latex + latex = unquote(latex) + new_span = tree.create_element("span") + wrapped_latex = wrap_math(latex) + new_span.html = replacement_manager.add_replacement(wrapped_latex, tag="math") + parent = img.parent + parent.replace_child(new_span, img) + if len(wrapped_latex.strip()) > 0: + info["found_math"] = True + info["codecogs_latex"] += 1 + except: + pass + if "latex.php" in src: + try: + # they usually have "alt='-i u_t + \Delta u = |u|^2 u'" + alt = img.getattr("alt") + if alt is None: + continue + # Unescape the latex + alt = unquote(alt) + # Get the latex + wrapped_alt = wrap_math(alt) + new_span = tree.create_element("span") + new_span.html = replacement_manager.add_replacement(wrapped_alt, tag="math") + parent = img.parent + parent.replace_child(new_span, img) + if len(wrapped_alt.strip()) > 0: + info["found_math"] = True + info["wp_latex"] += 1 + except: + pass + if "/images/math/codecogs" in src: + try: + # they usually have "alt='-i u_t + \Delta u = |u|^2 u'" + alt = img.getattr("alt") + if alt is None: + continue + # Unescape the latex + alt = unquote(alt) + # Get the latex + wrapped_alt = wrap_math(alt) + new_span = tree.create_element("span") + new_span.html = replacement_manager.add_replacement(wrapped_alt, tag="math") + parent = img.parent + parent.replace_child(new_span, img) + if len(wrapped_alt.strip()) > 0: + info["found_math"] = True + info["/images/math/codecogs"] += 1 + except: + pass + if "mimetex.cgi" in src: + try: + latex = src.split("?")[1:] + latex = "?".join(latex) # In case there are multiple ? 
in the latex + latex = unquote(latex) + new_span = tree.create_element("span") + wrapped_latex = wrap_math(latex) + new_span.html = replacement_manager.add_replacement(wrapped_latex, tag="math") + parent = img.parent + parent.replace_child(new_span, img) + if len(wrapped_latex.strip()) > 0: + info["found_math"] = True + info["mimetex.cgi"] += 1 + except: + pass + if "mathtex.cgi" in src: + try: + latex = src.split("?")[1:] + latex = "?".join(latex) # In case there are multiple ? in the latex + latex = unquote(latex) + new_span = tree.create_element("span") + wrapped_latex = wrap_math(latex) + new_span.html = replacement_manager.add_replacement(wrapped_latex, tag="math") + parent = img.parent + parent.replace_child(new_span, img) + if len(wrapped_latex.strip()) > 0: + info["found_math"] = True + info["mathtex.cgi"] += 1 + except: + pass + class_attr = img.getattr("class") + if class_attr is not None: + if "x-ck12" in class_attr: + try: + latex = img.getattr("alt") + latex = unquote(latex) + new_span = tree.create_element("span") + wrapped_latex = wrap_math(latex) + new_span.html = replacement_manager.add_replacement(wrapped_latex, tag="math") + parent = img.parent + parent.replace_child(new_span, img) + if len(wrapped_latex.strip()) > 0: + info["found_math"] = True + info["x-ck12"] += 1 + except: + pass + + # Find any blocks with class math-container and replace them with spans + math_containers = tree.document.query_selector_all(".math-container") + for math_container in math_containers: + text = math_container.text + new_span = tree.create_element("span") + wrapped_math = wrap_math(text, display=True) + new_span.html = replacement_manager.add_replacement(wrapped_math, tag="math") + parent = math_container.parent + parent.replace_child(new_span, math_container) + if len(wrapped_math.strip()) > 0: + info["found_math"] = True + info["math-container"] += 1 + + katex_inline_wp = tree.document.query_selector_all(".wp-katex-eq") + for katex in katex_inline_wp: + text = 
katex.text + new_span = tree.create_element("span") + display_attr = katex.getattr("data-display") + if display_attr is not None: + display = display_attr == "true" + else: + display = False + wrapped_math = wrap_math(text, display=display) + new_span.html = replacement_manager.add_replacement(wrapped_math, tag="math") + parent = katex.parent + parent.replace_child(new_span, katex) + if len(wrapped_math.strip()) > 0: + info["found_math"] = True + info["wp-katex-eq"] += 1 + + # Find all script[type="math/tex"] tags and replace them with spans + latex_script_tags = tree.document.query_selector_all('script[type="math/tex"]') + for script_tag in latex_script_tags: + text = script_tag.text + new_span = tree.create_element("span") + wrapped_text = wrap_math(text) + new_span.html = replacement_manager.add_replacement(wrapped_text, tag="math") + parent = script_tag.parent + parent.replace_child(new_span, script_tag) + if len(wrapped_text.strip()) > 0: + info["found_math"] = True + info["script_math_tex"] += 1 + + asciimath_script_tags = tree.document.query_selector_all('script[type="math/asciimath"]') + for script_tag in asciimath_script_tags: + try: + text = script_tag.text + new_span = tree.create_element("span") + wrapped_asciimath = wrap_math(extract_asciimath(text)) + new_span.html = replacement_manager.add_replacement(wrapped_asciimath, tag="math") + parent = script_tag.parent + parent.replace_child(new_span, script_tag) + if len(wrapped_asciimath.strip()) > 0: + info["found_math"] = True + info["script_math_asciimath"] += 1 + except: + # Delete this script tag + parent = script_tag.parent + parent.remove_child(script_tag) + + # For katex, find all elements with class = tex + katex_spans = tree.document.query_selector_all(".tex") + for katex_span in katex_spans: + try: + # Check if they have data-expr attr + expr = katex_span.getattr("data-expr") + if expr is None: + continue + # Replace with a span + new_span = tree.create_element("span") + wrapped_expr = 
wrap_math(expr) + new_span.html = replacement_manager.add_replacement(wrapped_expr, tag="math") + parent = katex_span.parent + parent.replace_child(new_span, katex_span) + if len(wrapped_expr.strip()) > 0: + info["found_math"] = True + info["katex"] += 1 + except: + pass + + # Find any spans with class "katex" + katex_spans = tree.document.query_selector_all("span.katex") + for katex_span in katex_spans: + # Find any spans with class "katex-html" and remove them + katex_html_spans = katex_span.query_selector_all("span.katex-html") + for katex_html_span in katex_html_spans: + parent = katex_html_span.parent + parent.remove_child(katex_html_span) + + # Remove any .MathJax_Preview spans + mathjax_preview_spans = tree.document.query_selector_all("span.MathJax_Preview") + for mathjax_preview_span in mathjax_preview_spans: + parent = mathjax_preview_span.parent + parent.remove_child(mathjax_preview_span) + + # Find any math tags + math_tags = tree.document.query_selector_all("math") + # For each math tag, see if there is an annotation tag with + # encoding="application/x-tex" inside it + for math_tag in math_tags: + annotation_tag = math_tag.query_selector('annotation[encoding="application/x-tex"]') + if annotation_tag is not None: + # Get the text content of the annotation tag + text = annotation_tag.text + # Set the content of the math tag to the text + # replace this math tag with a span tag with the text + # To do this, we need to get the parent of the math tag + parent = math_tag.parent + # Then, we need to create a new span tag + new_span = tree.create_element("span") + # Set the html of the new span tag to the text + wrapped_text = wrap_math(text) + new_span.html = replacement_manager.add_replacement(wrapped_text, tag="math") + # Then, we need to replace the math tag with the new span tag + parent.replace_child(new_span, math_tag) + # If the parent has style="display:none", then we need to + # remove the style attribute + style_value = parent.getattr("style") + if 
style_value is not None: + normalized_style_value = style_value.lower().strip().replace(" ", "").replace(";", "") + if "display:none" in normalized_style_value: + parent.delattr("style") + if len(wrapped_text.strip()) > 0: + info["found_math"] = True + info["math_annotations"] += 1 + # Check if the math tag has an alttext attribute + elif math_tag.getattr("alttext") is not None: + # Get the alttext attribute + alttext = math_tag.getattr("alttext") + new_span = tree.create_element("span") + # Set the html of the new span tag to the text + wrapped_alttext = wrap_math(alttext) + new_span.html = replacement_manager.add_replacement(wrapped_alttext, tag="math") + # Then, we need to replace the math tag with the new span tag + parent = math_tag.parent + parent.replace_child(new_span, math_tag) + if len(wrapped_alttext.strip()) > 0: + info["found_math"] = True + info["math_alttext"] += 1 + # Otherwise, translate the math tag to LaTeX + else: + try: + # Try translating to LaTeX + mathml = math_tag.html + # If this includes xmlns:mml, then we need to replace all + # instances of mml: with nothing + if "xmlns:mml" in mathml: + mathml = mathml.replace("mml:", "") + # replace xmlns:mml="..." 
with nothing + mathml = re.sub(r'xmlns:mml=".*?"', "", mathml) + latex = mml_to_latex(mathml) + # Make a new span tag + new_span = tree.create_element("span") + # Set the html of the new span tag to the text + wrapped_latex = wrap_math(latex) + new_span.html = replacement_manager.add_replacement(wrapped_latex, tag="math") + # Then, we need to replace the math tag with the new span tag + parent = math_tag.parent + parent.replace_child(new_span, math_tag) + if len(wrapped_latex.strip()) > 0: + info["found_math"] = True + info["mathml"] += 1 + except Exception as e: + parent = math_tag.parent + parent.remove_child(math_tag) + + mathjax_tags = tree.document.query_selector_all("mathjax") + for mathjax_tag in mathjax_tags: + # Get the inner text of the mathjax tag + text = mathjax_tag.text + text = html.unescape(text) + # Use regex to find text wrapped in hashes + matches = re.findall(r"#(.+?)#", text) + # For each match, replace the match with the LaTeX + for match in matches: + try: + latex = extract_asciimath(match) + # Replace the match with the LaTeX + text = text.replace(f"#{match}#", latex) + except Exception as e: + pass + + # Create a new span tag + new_span = tree.create_element("span") + # Set the html of the new span tag to the text + new_span.html = replacement_manager.add_replacement(text, tag="math") + # Then, we need to replace the mathjax tag with the new span tag + parent = mathjax_tag.parent + parent.replace_child(new_span, mathjax_tag) + if len(text.strip()) > 0: + info["found_math"] = True + info["mathjax_tag"] += 1 + + return tree, info + + +def remove_color(text): + return re.sub(color_regex, "", text) diff --git a/python/dolma/warc/openwebmath/line_processing.py b/python/dolma/warc/openwebmath/line_processing.py new file mode 100644 index 00000000..5908f538 --- /dev/null +++ b/python/dolma/warc/openwebmath/line_processing.py @@ -0,0 +1,82 @@ +import re + +from .constants import BOILERPLATE_WORDS + +edit_regex = r"\[(e|E)dit\]" + 
+BOILERPLATE_WORDS_MAP = {k: len(k.split()) for k in BOILERPLATE_WORDS} + + +def remove_empty_headers(lines, replacement_manager): + output_lines = [] + is_heading = [0] * len(lines) + for k in range(1, 7): + for i in range(len(lines)): + if replacement_manager.has_tag(lines[i], tag="h" + str(k)): + is_heading[i] = k + for i in range(len(lines)): + # Check if this line is a heading + if is_heading[i] != 0: + remove = False + # Go through the next lines until we find a line that is not a heading + j = i + 1 + while j < len(lines): + if is_heading[j] == 0 and len(lines[j]) > 16: + break + elif is_heading[j] != 0 and is_heading[j] <= is_heading[i]: + remove = True + break + j += 1 + # If we found a line that is not a heading, then we have a section + if j < len(lines) and not remove: + output_lines.append(lines[i]) + else: + output_lines.append(lines[i]) + # If there is at least one non-heading line, then we have a section + + return output_lines + + +def remove_edit_buttons(lines): + output_lines = [] + for line in lines: + if re.search(edit_regex, line): + output_lines.append(re.sub(edit_regex, "", line)) + else: + output_lines.append(line) + return output_lines + + +def remove_chinese_characters(lines): + output_lines = [] + for line in lines: + if re.match("[\u4e00-\u9fff]", line): + output_lines.append("") + else: + output_lines.append(line) + return output_lines + + +def remove_boilerplate(lines, boilerplate_config, replacement_manager): + output_lines = [] + maths = [replacement_manager.has_tag(line, tag="math") for line in lines] + codes = [replacement_manager.has_tag(line, tag="code") for line in lines] + for i in range(len(lines)): + lowered = lines[i].lower() + without_tags = replacement_manager.remove_tags(lowered) + s = sum([without_tags.count(word) * BOILERPLATE_WORDS_MAP[word] for word in BOILERPLATE_WORDS_MAP]) + # Compute the ratio of boilerplate words over the length of the line, and remove the line if this ratio is larger than the threshold + ratio 
= s / (len(without_tags.split()) + 0.001) + if ( + (ratio > boilerplate_config["ratio_threshold"] or s > boilerplate_config["absolute_threshold"]) + and not maths[i] + and not codes[i] + ): + if len(lines) - i < boilerplate_config["end_threshold"]: + for j in range(i, len(lines)): + if maths[j] or codes[j]: + output_lines.append(lines[j]) + break + else: + output_lines.append(lines[i]) + return output_lines diff --git a/python/dolma/warc/openwebmath/mmltex/README b/python/dolma/warc/openwebmath/mmltex/README new file mode 100644 index 00000000..6d173af1 --- /dev/null +++ b/python/dolma/warc/openwebmath/mmltex/README @@ -0,0 +1,97 @@ +README for the XSLT MathML Library + +XSLT MathML Library is a set of XSLT stylesheets to transform +MathML 2.0 to LaTeX. + +For more information, see +http://www.raleigh.ru/MathML/mmltex/index.php?lang=en + +Manifest +-------- + +README this file +mmltex.xsl +tokens.xsl +glayout.xsl +scripts.xsl +tables.xsl +entities.xsl +cmarkup.xsl + +Use +--- + +There are two ways of using the library: + + * Use a local copy of the library. + + 1. Download the distribution (see below). + + 2. Unpack the distribution, using unzip. + + 3. In your stylesheet import or include either the main + stylesheet, mmltex.xsl, or the stylesheet module you + wish to use, such as tokens.xsl. This example assumes + that the distribution has been extracted into the same + directory as your own stylesheet: + + + + * Import or include either the main stylesheet, or the + stylesheet module you wish to use, directly from the library + website; http://www.raleigh.ru/MathML/mmltex/. 
For example: + + <xsl:import href="http://www.raleigh.ru/MathML/mmltex/mmltex.xsl"/> +Obtaining The Library +--------------------- + +The XSLT MathML Library is available for download as: + + * Zip file: http://www.raleigh.ru/MathML/mmltex/mmltex.zip + +Copyright +--------- + +Copyright (C) 2001, 2002 Vasil Yaroshevich + +Permission is hereby granted, free of charge, to any person +obtaining a copy of this software and associated documentation +files (the ``Software''), to deal in the Software without +restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, sublicense, and/or +sell copies of the Software, and to permit persons to whom the +Software is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +Except as contained in this notice, the names of individuals +credited with contribution to this software shall not be used in +advertising or otherwise to promote the sale, use or other +dealings in this Software without prior written authorization +from the individuals in question. + +Any stylesheet derived from this Software that is publically +distributed will be identified with a different name and the +version strings in any derived Software will be changed so that +no possibility of confusion between the derived package and this +Software will exist. + +Warranty +-------- + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER +CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE.
+ +Contacting the Author +--------------------- + +These stylesheets are maintained by Vasil Yaroshevich, . diff --git a/python/dolma/warc/openwebmath/mmltex/cmarkup.xsl b/python/dolma/warc/openwebmath/mmltex/cmarkup.xsl new file mode 100644 index 00000000..c7a7219e --- /dev/null +++ b/python/dolma/warc/openwebmath/mmltex/cmarkup.xsl @@ -0,0 +1,1093 @@ + + + + + + + + + + + + + + + + i + + + + + / + + + + + + _{} + + + + + e^{i + + } + + + + + E + + + + + + + + \mathrm{} + + + + + + + + + + + + + ( + + + , + + ) + + + + + () + + + + + + + \left( + + \left[ + + + , + + + + \right) + + \right] + + + + + \left\{\right\} + + + + + ^{(-1)} + + + + + + + + \mathrm{lambda}\: + + .\: + + + + + + + + + + \circ + + + + +\mathrm{id} + + + + \mathop{\mathrm{ + + }} + + + + + + + + \begin{cases} + + + \end{cases} + + + + + & \text{if $ + + $} + \\ + + + + + & \text{otherwise} + + + + + \left\lfloor\frac{ + + }{ + + }\right\rfloor + + + + + + + + ! + + + + + + + \left( + \frac{ + + + }{ + + + } + \right) + + + + + \ + + \{ + + + + , + + + + + + , + + + + \} + + + + + - + + + + + + + + + - + + + + + + + + + + ( + + + + + - + + + + + + + + + + + + + + + + + + + + + + + + + + + + ) + + + + + + + + + ^{ + + + + } + + + + + + + \mod + + + + + + + + + + ( + + + + \times + + + + + + + + + + ) + + + + + \sqrt + + [ + + ] + + { + + } + + + +\gcd + + + + + + + + \land + + + + + + + + + + \lor + + + + + + + + + + \mathop{\mathrm{xor}} + + + + + + \neg + + + + + + + + + + \implies + + + + + + + + \ + + + + + , + + + \colon + + + + + + + \left| + + \right| + + + + + \overline{} + + + +\Re + + +\Im + + + + \lfloor + + \rfloor + + + + + \lceil + + \rceil + + + + + + + + + = + + + + + + + + + + \neq + + + + + + + + + + > + + + + + + + + + + < + + + + + + + + + + \ge + + + + + + + + + + \le + + + + + + + + + + \equiv + + + + + + + + + + \approx + + + + + + + + | + + + + + + + + \int + + _{ + + } + + + ^{ + + } + + + + \,d + + + + + + + ^\prime + + + + \frac{ + + + d^{ + + } + + }{d + + ^{ + + } 
+ + + d + + }{d + + } + + + } + + + + + D_{ + + + , + + } + + + + + \frac{\partial^{ + + + + + + + + + + + + + + + + + + + + + } + + }{ + + \partial + + + ^{ + + } + + + } + + + + + + + + + , + + + +\mathop{\mathrm{div}} + + +\nabla^2 + + + + \{\} + + + + + \left[\right] + + + + + + + \colon + + + + + + , + + + + + + + + + + + + \cup + + + + + + + + + + \cap + + + + + + + + \in + + + + + + + + + + \notin + + + + + + + + + + + + \subseteq + + + + + + + + + + \subset + + + + + + + + + + \nsubseteq + + + + + + + + + + \not\subset + + + + + + + + + + \setminus + + + + + + | + + | + + + + + + + + + \times + + + + + + + + ^{ + + } + + + + + \sum + + + + + \prod + + + + + _{ + + + = + + + } + + + ^{ + + } + + + + + + + + \lim_{ + + } + + + + + + \to + + + + + + + + + + + + \searrow + \nearrow + \rightarrow + \to + + + + + + + + \ + + + + + + + + + \ + + + + + + \mathrm{ + + \,} + + + + + + + \mathrm{ + + } + + + + + e^{} + + + + + \lg + + + + + + + \log_{ + + } + + + + + + + + \langle + + + , + + \rangle + + + +\sigma + + + + \sigma( + + )^2 + + + + + \langle + + ^{ + + }\rangle + + _{ + + } + + + + + + + \left(\begin{array}{c} + + + \\ + + \end{array}\right) + + + + + \begin{pmatrix} + + \end{pmatrix} + + + + + + + & + + \\ + + + + + \det + + + + + + + \begin{vmatrix} + + \end{vmatrix} + + + + + + + + ^T + + + + + + + + _{ + + + , + + } + + + + + + + + + \dot + + + + + + + + + + + +\mathbb{Z} + + +\mathbb{R} + + +\mathbb{Q} + + +\mathbb{N} + + +\mathbb{C} + + +\mathbb{P} + + +e + + +i + + +NaN + + +\mbox{true} + + +\mbox{false} + + +\emptyset + + +\pi + + +\gamma + + +\infty + + + + + + + ( + + + + + + + + + ) + + + + + + + ( + + + + + + + + ) + + + \ No newline at end of file diff --git a/python/dolma/warc/openwebmath/mmltex/entities.xsl b/python/dolma/warc/openwebmath/mmltex/entities.xsl new file mode 100644 index 00000000..75bebf60 --- /dev/null +++ b/python/dolma/warc/openwebmath/mmltex/entities.xsl @@ -0,0 +1,316 @@ + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/python/dolma/warc/openwebmath/mmltex/glayout.xsl b/python/dolma/warc/openwebmath/mmltex/glayout.xsl new file mode 100644 index 00000000..021a4ef4 --- /dev/null +++ b/python/dolma/warc/openwebmath/mmltex/glayout.xsl @@ -0,0 +1,220 @@ + + + + + + + + + + + + + + \genfrac{}{}{ + + + + ex + + + .05ex + + + + .2ex + + + + + + }{}{ + + + \frac{ + + + + \hfill + + + + \hfill + + }{ + + \hfill + + + + \hfill + + } + + + + + + \sqrt[ + + ]{ + + } + + + + exception 25: + \text{exception 25:} + + + + + + \sqrt{ + + } + + + + + + + \left + + + \ + + + + \left( + + + + + + + + + + + , + + + + + + + + + + + + + + + + + + + + + + + + \right + + + \ + + + + \right) + + + + + \phantom{ + + } + + + + + + \overline{ + + \hspace{.2em}|} + + + \sqrt{ + + } + + + \overline{) + + } + + + + + + + + + + + \colorbox[rgb]{ + + + + }{$ + + + \textcolor[rgb]{ + + + + }{ + + + + } + + + $} + + + + + + + + + \ No newline at end of file diff --git a/python/dolma/warc/openwebmath/mmltex/mmltex.xsl b/python/dolma/warc/openwebmath/mmltex/mmltex.xsl new file mode 100644 index 00000000..a7d735c5 --- /dev/null +++ b/python/dolma/warc/openwebmath/mmltex/mmltex.xsl @@ -0,0 +1,45 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + $ + + $ + + + \ No newline at end of file diff --git a/python/dolma/warc/openwebmath/mmltex/scripts.xsl b/python/dolma/warc/openwebmath/mmltex/scripts.xsl new file mode 
100644 index 00000000..fcaab18b --- /dev/null +++ b/python/dolma/warc/openwebmath/mmltex/scripts.xsl @@ -0,0 +1,292 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + \overline{ + + + + + } + + + \overbrace{ + + + + + } + + + \underline{ + + + + + + } + + + \underbrace{ + + + + + + } + + + + + _{ + + }^{ + + } + + + \underset{ + + }{\overset{ + + }{ + + }} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \overline{ + + } + + + \overbrace{ + + } + + + + + ^{ + + } + + + \stackrel{ + + }{ + + } + + + + + + + + + + + \underline{ + + } + + + \underbrace{ + + } + + + + + _{ + + } + + + \underset{ + + }{ + + } + + + + + + { + + }_{ + + }^{ + + } + + + + { + + }^{ + + } + + + + { + + }_{ + + } + + + + + + {}_{ + + } + + + {}^{ + + } + + + + + + {} + + + _{ + + } + + + ^{ + + } + + + + + + + + + + + + + + {} + + + _{ + + } + + + ^{ + + } + + + + + + + \ No newline at end of file diff --git a/python/dolma/warc/openwebmath/mmltex/tables.xsl b/python/dolma/warc/openwebmath/mmltex/tables.xsl new file mode 100644 index 00000000..ad1a10a0 --- /dev/null +++ b/python/dolma/warc/openwebmath/mmltex/tables.xsl @@ -0,0 +1,130 @@ + + + + + + + + + \multicolumn{ + + }{c}{ + + } + + & + + + + + + + \hfill + + + + \hfill + + + + & + + + + + + + \\ + + + + + \begin{array}{ + + | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + | + + } + + \hline + + + + \\ \hline + + \end{array} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/python/dolma/warc/openwebmath/mmltex/tokens.xsl b/python/dolma/warc/openwebmath/mmltex/tokens.xsl new file mode 100644 index 00000000..0d1d750d --- /dev/null +++ b/python/dolma/warc/openwebmath/mmltex/tokens.xsl @@ -0,0 +1,296 @@ + + + + + + + + + + + + + + + \mathrm{ + + } + + + + + + + + + + + + + + + + + + + + + + \text{ + + } + + + + \phantom{\rule + + [- + + ] + + { + + 0ex + + + }{ + + 0ex + + + }} + + + + + + " + + + " + + + + + + 
\colorbox[rgb]{ + + + + }{$ + + + \textcolor[rgb]{ + + + + }{ + + + + + \mathrm{ + + + \mathbf{ + + + \mathit{ + + + \mathbit{ + + + \mathbb{ + + + { + + + \mathcal{ + + + \mathsc{ + + + \mathfrak{ + + + \mathsf{ + + + \mathbsf{ + + + \mathsfit{ + + + \mathbsfit{ + + + \mathtt{ + + + { + + + + + + } + + + } + + + $} + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + , + + + + + + , + + + + + + + + + + + + + + + + + + + , + + + + + + + + + + + , + + + + + + + + + + + + + + 0,1,1 + 0,0,0 + 0,0,1 + 1,0,1 + .5,.5,.5 + 0,.5,0 + 0,1,0 + .5,0,0 + 0,0,.5 + .5,.5,0 + .5,0,.5 + 1,0,0 + .75,.75,.75 + 0,.5,.5 + 1,1,1 + 1,1,0 + + Exception at color template + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Exception at Hex2Decimal template + + + + + + + + + + + \ No newline at end of file diff --git a/python/dolma/warc/openwebmath/tree_processing.py b/python/dolma/warc/openwebmath/tree_processing.py new file mode 100644 index 00000000..20b668b8 --- /dev/null +++ b/python/dolma/warc/openwebmath/tree_processing.py @@ -0,0 +1,400 @@ +from resiliparse.parse.html import DOMCollection +from tabulate import tabulate +from text_extract.utils import has_style + +header_to_format = {f"h{i}": f"[heading_{i}]" for i in range(1, 7)} + + +def remove_buttons(tree): + btns = tree.document.query_selector_all(".btn") + for btn in btns: + parent = btn.parent + parent.remove_child(btn) + # Remove any button tags + btns = tree.document.query_selector_all("button") + for btn in btns: + parent = btn.parent + if parent: + parent.remove_child(btn) + + +def remove_links(tree): + """Replace links with spans so that resiliparse doesn't try to remove them.""" + links = tree.document.query_selector_all("a") + for link in links: + parent = link.parent + if parent is None: + continue + new_span = tree.create_element("span") + new_span.text = link.text + parent.replace_child(new_span, link) + + +def flatten(node): + """Remove any divs or spans that only have one child and replace 
them with their child.""" + divs = node.query_selector_all("div") + spans = node.query_selector_all("span") + for div in divs: + if len(div.child_nodes) == 1: + parent = div.parent + if parent is None: + continue + parent.replace_child(div.child_nodes[0], div) + for span in spans: + if len(span.child_nodes) == 1: + parent = span.parent + if parent is None: + continue + parent.replace_child(span.child_nodes[0], span) + + return node + + +def remove_dense_links(tree): + """Remove lists that only have links.""" + # First, remove any nav elements to be safe. + navs = tree.document.query_selector_all("nav") + for nav in navs: + parent = nav.parent + if parent is None: + continue + parent.remove_child(nav) + + lists = tree.document.query_selector_all("ul, ol, div, span, nav, table, p") + to_remove = [] + for _list in lists: + if len(_list.child_nodes) == 0 or len(_list.child_nodes) == 1: + continue + children = _list.child_nodes + links = _list.query_selector_all("a") + total_children_text = "".join([x.text.strip() for x in children if type(x) != DOMCollection]) + total_links_text = "".join([x.text.strip() for x in links]) + if len(total_children_text) == 0 or len(total_links_text) == 0: + continue + ratio = len(total_links_text) / len(total_children_text) + if ratio > 0.8: + parent = _list.parent + if parent is None: + continue + to_remove.append(_list) + + for _list in to_remove: + parent = _list.parent + if parent is None: + continue + parent.remove_child(_list) + + +def remove_image_figures(tree): + to_remove = [] + imgs = tree.document.query_selector_all("img") + for img in imgs: + cur_node = img + while cur_node is not None: + if cur_node.class_name == "figure": + parent = cur_node.parent + if parent: + to_remove.append(cur_node) + break + cur_node = cur_node.parent + + for node in to_remove: + parent = node.parent + if parent is None: + continue + parent.remove_child(node) + + +def remove_link_clusters(tree): + # First, find all links that are in span blocks. 
If they have no siblings, delete the span. + to_remove = [] + + span_links = tree.document.query_selector_all("span a") + for link in span_links: + parent = link.parent + if parent is None: + continue + n_siblings = 0 + for sibling in parent.child_nodes: + if sibling.type == 1: + n_siblings += 1 + break + if n_siblings == 1: + grandparent = parent.parent + if grandparent is None: + continue + # grandparent.remove_child(parent) + to_remove.append(parent) + + links = list(tree.document.query_selector_all("a")) + + i = 0 + while len(links) > 0: + link = links[0] + del links[0] + parent = link.parent + i += 1 + if parent is None or parent.parent is None: + continue + n_links = 0 + n_children = len(parent.child_nodes) + child_links = parent.query_selector_all("a") + if len(child_links) == n_children: + for child_link in child_links: + # Check if it's visible and not empty. + empty = child_link.text is None or child_link.text.strip() == "" + styles = child_link.getattr("style") + visible = styles is None or not ( + has_style("display: none", styles) or has_style("visibility: hidden", styles) + ) + if visible and not empty: + n_links += 1 + multilink = n_links > 1 and n_children == n_links + if multilink: + grandparent = parent.parent + if grandparent is None: + continue + # grandparent.remove_child(parent) + to_remove.append(parent) + + for node in to_remove: + parent = node.parent + if parent is None: + continue + parent.remove_child(node) + + +def extract_code(tree, replacement_manager): + wp_syntax = tree.document.query_selector_all(".wp_syntax") + codes = tree.document.query_selector_all("code") + code_responsive = tree.document.query_selector_all(".code_responsive") + pre_tags = tree.document.query_selector_all("pre") + for code in [*wp_syntax, *codes, *code_responsive, *pre_tags]: + multiline = code.text.count("\n") > 0 + if len(code.text) > 0: + if multiline: + code.text = replacement_manager.add_replacement(f"```{code.text}```", tag="code") + else: + code.text = 
replacement_manager.add_replacement(f"`{code.text}`", tag="code") + + +def extract_tables(node, replacement_manager, table_config): + if table_config["format"] == "none": + return + # Don't worry about tables that have tables in them or have headers + # tables = node.query_selector_all('table:not(:has(table *))') + tables = node.query_selector_all("table:not(:has(table, h1, h2, h3, h4, h5, h6))") + for table in tables: + table_data = [] + headers = [] + # Find all headers + ths = table.query_selector_all("th") + for th in ths: + headers.append(th.text) + trs = table.query_selector_all("tr") + for tr in trs: + row_data = [] + tds = tr.query_selector_all("td") + for td in tds: + # Remove any scripts + scripts = td.query_selector_all("script") + for script in scripts: + script.parent.remove_child(script) + # Get the text of each td element + row_data.append(td.text) + col_span = td.getattr("colspan") + if col_span: + try: + col_span = int(col_span) + if col_span > 100: + continue + except ValueError: + continue + # Add empty cells for colspans + for _ in range(col_span - 1): + row_data.append("") + table_data.append(row_data) + if len(table_data) == 0 or len(table_data[0]) == 0: + continue + # Post processing + # Make sure all rows have the same number of columns + max_cols = max([len(row) for row in table_data]) + for row in table_data: + if len(row) < max_cols: + row.extend([""] * (max_cols - len(row))) + # Strip all cells + for i in range(len(table_data)): + for j in range(len(table_data[i])): + table_data[i][j] = table_data[i][j].strip() + # If any columns or rows are consistently empty, remove them + # Remove empty columns + empty_columns = [] + for i in range(len(table_data[0])): + if all([len(row[i]) == 0 for row in table_data]): + empty_columns.append(i) + + for i in reversed(empty_columns): + for row in table_data: + del row[i] + # Remove empty rows + table_data = [row for row in table_data if len(row) > 0] + + # Remove any newlines from the table + for i in 
range(len(table_data)): + for j in range(len(table_data[i])): + table_data[i][j] = table_data[i][j].replace("\n", " ") + # Check that the table has at least one row and one column + if len(table_data) >= table_config["min_rows"] and len(table_data[0]) >= table_config["min_cols"]: + # Replace the table with a markdown + parent = table.parent + if parent: + if len(headers) == 0: + headers = [""] * len(table_data[0]) + rendered_table = tabulate(table_data, tablefmt=table_config["format"], headers=headers) + table.html = replacement_manager.add_replacement(rendered_table, tag="table") + elif len(table_data) > 0 and len(table_data[0]) > 0: + # Do the same but use a plain format + # Replace the table with a markdown + parent = table.parent + if parent: + if len(headers) == 0: + headers = [""] * len(table_data[0]) + rendered_table = tabulate(table_data, tablefmt="plain", headers=headers) + table.html = replacement_manager.add_replacement(rendered_table, tag="table") + else: + # Remove empty tables + if table.parent: + table.parent.remove_child(table) + + return node + + +def extract_headings(tree, replacement_manager, markdown_formatting): + to_remove = [] + for heading_tag in header_to_format: + hs = tree.document.query_selector_all(heading_tag) + for heading in hs: + text = "" + for child in heading.child_nodes: + if child.text.strip() != "" and child.type != 8: + text += child.text + child.text = "" + text = text.strip() + if len(text) == 0: + # remove the heading + if heading.parent: + to_remove.append(heading) + continue + if markdown_formatting: + heading.text = replacement_manager.add_replacement( + header_to_format[heading_tag] + " " + text + "\n\n", tag=heading_tag + ) + else: + heading.text = replacement_manager.add_replacement(text + "\n\n", tag=heading_tag) + + for heading in to_remove: + parent = heading.parent + if parent: + parent.remove_child(heading) + + +def post_process_headings(text): + """Replace [heading_i] with '#' * i""" + for i in range(6, 0, -1): 
+ text = text.replace("[heading_%d]" % i, "#" * i) + return text + + +def add_se_separators(tree): + user_infos = tree.document.query_selector_all("table.fw") + # Replace all of these with spans - + for user_info in user_infos: + new_span = tree.create_element("span") + new_span.text = "-" + parent = user_info.parent + # Remove the table + parent.remove_child(user_info) + # Add the span + parent.append_child(new_span) + + +def wikipedia_preprocess(tree): + external_links = tree.document.query_selector("#External_links") + if external_links: + # Remove all next until nothing left + node = external_links.parent.next + while node: + next = node.next + node.parent.remove_child(node) + node = next + external_links.parent.remove_child(external_links) + + edit_buttons = tree.document.query_selector_all(".mw-editsection") + for edit_button in edit_buttons: + if edit_button.parent: + edit_button.parent.remove_child(edit_button) + + +def remove_display_none(tree): + # Remove all elements with display none + elements = tree.document.query_selector_all('[style*="display:none"]') + for element in elements: + element.parent.remove_child(element) + + +def preserve_question_headers(tree): + elements = tree.document.query_selector_all("#question-header") + for element in elements: + inner_h1 = element.query_selector("h1") + if inner_h1: + new_h1 = tree.create_element("h1") + new_h1.text = inner_h1.text + element.parent.replace_child(new_h1, element) + + +def main_content_preprocess(tree): + """Make any changes that are necessary to maximize the performance + of the resiliparse main_content=True option.""" + + # Look for qa-main class + qa_main = tree.document.query_selector(".qa-main") + if qa_main: + qa_main.setattr("class", "article-body") + + # If there is a role=main and a question-header class, add the question-header to the top of the role=main + role_main = tree.document.query_selector('[role="main"]') + if role_main: + question_header = 
tree.document.query_selector("#question-header") + if question_header: + first_child = role_main.first_child + if first_child: + role_main.insert_before(question_header, first_child) + + post_content = tree.document.query_selector(".postcontent") + if post_content: + post_body = tree.document.query_selector(".postbody") + if post_body: + # Set the class of postbody to postcontent and remove the postcontent class + post_body.setattr("class", "postcontent") + post_content.setattr("class", "") + + # Find .postbit + postbit = tree.document.query_selector(".postbit") + if postbit: + # Change the class to article-body + postbit.setattr("class", "") + + # Find all ul and add a few wrapping divs to move them farther from the root node + uls = tree.document.query_selector_all("ul") + for ul in uls: + # Create 4 nested divs and set the html of the last one to the html of the ul. Then replace the ul with the last div + div1 = tree.create_element("div") + div2 = tree.create_element("div") + div3 = tree.create_element("div") + div4 = tree.create_element("div") + div4.html = ul.html + div3.append_child(div4) + div2.append_child(div3) + div1.append_child(div2) + if ul.parent: + ul.parent.replace_child(div1, ul) diff --git a/python/dolma/warc/openwebmath/utils.py b/python/dolma/warc/openwebmath/utils.py new file mode 100644 index 00000000..c1885c87 --- /dev/null +++ b/python/dolma/warc/openwebmath/utils.py @@ -0,0 +1,105 @@ +import re + +import numpy as np +import yaml + + +def has_style(style, styles): + """Does the style string contain any of the styles? + This function is robust to variations in the spaces between the styles. + """ + # Remove any spaces. 
+ style = style.replace(" ", "") + styles = [s.replace(" ", "") for s in styles] + for s in styles: + if s in style: + return True + return False + + +def word_wrap(text, char_width=20): + """Wrap text to a given width, not breaking words.""" + if not text: + return "" + + words = text.split() + lines = [] + current_line = [] + + for word in words: + if len(" ".join(current_line + [word])) <= char_width: + current_line.append(word) + else: + if current_line: # Check if current_line is not empty + lines.append(" ".join(current_line)) + current_line = [word] + + # Handle the case when the word is longer than the character width + while len(current_line[0]) > char_width: + lines.append(current_line[0][:char_width]) + current_line[0] = current_line[0][char_width:] + + if current_line: + lines.append(" ".join(current_line)) + + return "\n".join(lines) + + +class ReplacementManager: + """This replacement manager simply adds tags next to the instances of the text. + It contains a method to remove these tags.""" + + def __init__(self): + self.tags = [] + + def add_replacement(self, text, tag="default"): + self.tags.append(tag) + return f"§§{tag}§§" + text + + def remove_tags(self, text): + tag_regex = "|".join(f"§§{tag}§§" for tag in self.tags) + return re.sub(tag_regex, "", text) + + def has_tag(self, text, tag): + return f"§§{tag}§§" in text + + +class Config: + """A simple config object that loads a config from a YAML file and + presents as a dictionary""" + + def __init__(self, config_file): + with open(config_file, "r") as f: + self.config = yaml.safe_load(f) + + def sample_from_list(self, list): + """Sample from a list of (probability, value) tuples.""" + probabilities = [p for p, _ in list] + values = [v for _, v in list] + probabilities = np.array(probabilities) + probabilities /= probabilities.sum() + return np.random.choice(values, p=probabilities) + + def _sample(self, config): + # For every value that has a type of list, first check it is in the format of: + # 
- (probability, value) + # - (probability, value) + # - ... + # And the probabilities sum to 1. + # Then sample from the list. + sampled_config = {} + for key, value in config.items(): + # print the type of the value + if isinstance(value, list): + # Check the format of the list. + # Check the probabilities sum to 1. + # Sample from the list. + sampled_config[key] = self.sample_from_list(value) + elif isinstance(value, dict): + sampled_config[key] = self._sample(value) + else: + sampled_config[key] = value + return sampled_config + + def sample(self): + return self._sample(self.config) From cbc448dd56f6dd1cbee6cd61e79591b15436ad27 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 10 May 2024 19:47:29 +0000 Subject: [PATCH 009/169] pipeline --- configs/crawl/science.yaml | 6 ++- pyproject.toml | 2 +- python/dolma/taggers/language.py | 2 +- python/dolma/warc/openwebmath/__init__.py | 14 ++++--- python/dolma/warc/openwebmath/config.py | 41 ------------------- .../openwebmath/configs/randomize_all.yaml | 35 ++++++++++++++++ .../dolma/warc/openwebmath/tree_processing.py | 3 +- python/dolma/warc/openwebmath/utils.py | 7 +++- 8 files changed, 56 insertions(+), 54 deletions(-) delete mode 100644 python/dolma/warc/openwebmath/config.py create mode 100644 python/dolma/warc/openwebmath/configs/randomize_all.yaml diff --git a/configs/crawl/science.yaml b/configs/crawl/science.yaml index bec9f948..f74dc0c5 100644 --- a/configs/crawl/science.yaml +++ b/configs/crawl/science.yaml @@ -3,7 +3,9 @@ destination: - ${oc.env:HOME}/science/CC-MAIN-2024-18/documents processes: ${d.procs:} source_name: cccc_CC-MAIN-2024-18 -linearizer: openwebmath +# linearizer: openwebmath +linearizer: resiliparse + pre: taggers: - owm_math_v1 @@ -16,7 +18,7 @@ post: - ft_dolma_doc_eng skip: true -store_html_in_metadata: false +store_html_in_metadata: true work_dir: input: /tmp/science/CC-MAIN-2024-18/input diff --git a/pyproject.toml b/pyproject.toml index e85995e4..bc601217 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -185,7 +185,7 @@ features = ["pyo3/extension-module"] where = ["src"] [tool.setuptools.package-data] -dolma = ["py.typed", "data/*", "warc/openwebmath/mmltex/*"] +dolma = ["py.typed", "data/*", "warc/openwebmath/mmltex/*.xsl", "warc/openwebmath/configs/*.yaml"] [tool.black] line-length = 115 diff --git a/python/dolma/taggers/language.py b/python/dolma/taggers/language.py index 2dab5390..6feae86d 100644 --- a/python/dolma/taggers/language.py +++ b/python/dolma/taggers/language.py @@ -184,7 +184,7 @@ def predict_text(self, text: str) -> List[Tuple[str, float]]: @TaggerRegistry.add("ft_dolma_doc_eng") class FastTextEnglishDolmaTagger(FastTextEnglishLanguageDocumentTagger): - INCLUDE_NEGATIVE = True + INCLUDE_NEGATIVE = False PREDICT_ON_PARAGRAPHS = False def predict_text(self, text: str) -> List[Tuple[str, float]]: diff --git a/python/dolma/warc/openwebmath/__init__.py b/python/dolma/warc/openwebmath/__init__.py index 68a5cf7e..6270c76a 100644 --- a/python/dolma/warc/openwebmath/__init__.py +++ b/python/dolma/warc/openwebmath/__init__.py @@ -2,22 +2,24 @@ from typing import Optional from necessary import necessary -from omegaconf import OmegaConf as om -from .config import OpenWebMathConfig +from .utils import Config -__all__ = ["Extractor", "OpenWebMathConfig"] +__all__ = ["Extractor", "Config"] class Extractor: - def __init__(self, config: Optional[OpenWebMathConfig] = None): + + def __init__(self, config: Optional[Config] = None): necessary("tabulate", message="{module_name} not available; please install dolma[openwebmath]") necessary("py_asciimath", message="{module_name} not available; please install dolma[openwebmath]") necessary("lxml", message="{module_name} not available; please install dolma[openwebmath]") from .extract import extract_text - parsed_config = om.to_object(config or om.structured(OpenWebMathConfig)) - self._extract_fn = partial(extract_text, config=parsed_config, fast=False) + # create a config, merge it 
with empty dictionary to make sure it is casted to a python dict + config = config or Config() + + self._extract_fn = partial(extract_text, config=config.sample(), fast=False) def extract_text(self, html: str) -> str: out = self._extract_fn(html) diff --git a/python/dolma/warc/openwebmath/config.py b/python/dolma/warc/openwebmath/config.py deleted file mode 100644 index 97a8712c..00000000 --- a/python/dolma/warc/openwebmath/config.py +++ /dev/null @@ -1,41 +0,0 @@ -from dataclasses import dataclass, field -from typing import List, NamedTuple - - -class ManagerTuple(NamedTuple): - score: float - activate: bool - - -class ThresholdTuple(NamedTuple): - lo: float - hi: float - - -@dataclass -class BoilerPlateConfig: - ratio_threshold: List[ThresholdTuple] = [ThresholdTuple(0.9, 0.18), ThresholdTuple(0.1, 0.30)] - absolute_threshold: List[ThresholdTuple] = [ThresholdTuple(0.9, 10), ThresholdTuple(0.1, 20)] - end_threshold: List[ThresholdTuple] = [ThresholdTuple(0.95, 15), ThresholdTuple(0.05, 5)] - enable: List[ManagerTuple] = [ManagerTuple(0.95, True), ManagerTuple(0.05, False)] - - -@dataclass -class TableConfig: - min_rows: int = 2 - min_cols: int = 3 - format: str = "plain" - - -@dataclass -class OpenWebMathConfig: - markdown_headings: List[ManagerTuple] = [ManagerTuple(0.9, True), ManagerTuple(0.1, False)] - markdown_code: List[ManagerTuple] = [ManagerTuple(0.95, True), ManagerTuple(0.05, False)] - boilerplate_config: BoilerPlateConfig = field(default_factory=BoilerPlateConfig) - remove_buttons: bool = True - remove_image_figures: bool = True - remove_link_clusters: bool = True - table_config: TableConfig = field(default_factory=TableConfig) - remove_chinese: bool = True - remove_edit_buttons: bool = True - extract_latex: bool = True diff --git a/python/dolma/warc/openwebmath/configs/randomize_all.yaml b/python/dolma/warc/openwebmath/configs/randomize_all.yaml new file mode 100644 index 00000000..19f684b3 --- /dev/null +++ 
b/python/dolma/warc/openwebmath/configs/randomize_all.yaml @@ -0,0 +1,35 @@ +markdown_headings: [ + [0.9, True], + [0.1, False], + ] +markdown_code: [ + [0.95, True], + [0.05, False], + ] +boilerplate_config: + ratio_threshold: [ + [0.9, 0.18], + [0.1, 0.30], + ] + absolute_threshold: [ + [0.9, 10], + [0.1, 20], + ] + end_threshold: [ + [0.95, 15], + [0.05, 5], + ] + enable: [ + [0.95, True], + [0.05, False], + ] +remove_buttons: True +remove_image_figures: True +remove_link_clusters: True +table_config: + min_rows: 2 + min_cols: 3 + format: 'plain' +remove_chinese: True +remove_edit_buttons: True +extract_latex: True diff --git a/python/dolma/warc/openwebmath/tree_processing.py b/python/dolma/warc/openwebmath/tree_processing.py index 20b668b8..d05fd037 100644 --- a/python/dolma/warc/openwebmath/tree_processing.py +++ b/python/dolma/warc/openwebmath/tree_processing.py @@ -1,6 +1,7 @@ from resiliparse.parse.html import DOMCollection from tabulate import tabulate -from text_extract.utils import has_style + +from .utils import has_style header_to_format = {f"h{i}": f"[heading_{i}]" for i in range(1, 7)} diff --git a/python/dolma/warc/openwebmath/utils.py b/python/dolma/warc/openwebmath/utils.py index c1885c87..8f14af74 100644 --- a/python/dolma/warc/openwebmath/utils.py +++ b/python/dolma/warc/openwebmath/utils.py @@ -1,8 +1,11 @@ import re +from pathlib import Path import numpy as np import yaml +CONFIG_PATH = Path(__file__).parent / "configs/randomize_all.yaml" + def has_style(style, styles): """Does the style string contain any of the styles? 
@@ -68,8 +71,8 @@ class Config: """A simple config object that loads a config from a YAML file and presents as a dictionary""" - def __init__(self, config_file): - with open(config_file, "r") as f: + def __init__(self, config_file=CONFIG_PATH): + with open(config_file, "rt") as f: self.config = yaml.safe_load(f) def sample_from_list(self, list): From db2491ca2d04fb87db88ab607dc21e5a601fa06e Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 10 May 2024 20:01:28 +0000 Subject: [PATCH 010/169] errors --- python/dolma/warc/processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dolma/warc/processor.py b/python/dolma/warc/processor.py index c59f6f51..f3c2a8e6 100644 --- a/python/dolma/warc/processor.py +++ b/python/dolma/warc/processor.py @@ -156,7 +156,7 @@ def process_single( if record.http_charset: try: decoded_content = content.decode(record.http_charset).strip() - except UnicodeDecodeError: + except (UnicodeDecodeError, LookupError): decoded_content = "" if not decoded_content and (encoding := detect(content)["encoding"]): decoded_content = content.decode(str(encoding)).strip() From 149fea8a5bc6719e3125578f917c537db5d7ec29 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 10 May 2024 20:13:12 +0000 Subject: [PATCH 011/169] added backoff --- pyproject.toml | 4 ++-- python/dolma/warc/processor.py | 26 ++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bc601217..746cd885 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -128,8 +128,8 @@ warc = [ "fastwarc", "w3lib", "url-normalize", - "dateparser" - + "dateparser", + "backoff" ] trafilatura = [ # must include warc dependencies diff --git a/python/dolma/warc/processor.py b/python/dolma/warc/processor.py index f3c2a8e6..288837fc 100644 --- a/python/dolma/warc/processor.py +++ b/python/dolma/warc/processor.py @@ -30,6 +30,10 @@ if DATEPARSER_AVAILABLE or TYPE_CHECKING: import dateparser +with 
necessary("backoff", soft=True) as BACKOFF_AVAILABLE: + if BACKOFF_AVAILABLE or TYPE_CHECKING: + import backoff + DATE_FORMATS = ["%a, %d %b %Y %H:%M:%S %Z", "%Y-%m-%dT%H:%M:%SZ"] @@ -39,10 +43,9 @@ class WarcProcessor(BaseParallelProcessor): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - if not FASTWARC_AVAILABLE: - raise_warc_dependency_error("fastwarc") - if not DATEPARSER_AVAILABLE: - raise_warc_dependency_error("dateparser") + assert FASTWARC_AVAILABLE, raise_warc_dependency_error("fastwarc") + assert DATEPARSER_AVAILABLE, raise_warc_dependency_error("dateparser") + assert BACKOFF_AVAILABLE, raise_warc_dependency_error("backoff") @staticmethod def _format_to_dolma_timestamp(timestamp: Optional[datetime.datetime] = None) -> str: @@ -80,6 +83,21 @@ def process_single( destination_path: str, queue: QueueType, **kwargs, + ): + max_time = kwargs.pop('backoff_max_time', None) or 10 ** 60 + max_tries = kwargs.pop('backoff_max_tries', None) or 10 + fn = backoff.on_exception(backoff.expo, Exception, max_time=max_time, max_tries=max_tries)( + cls._process_single_without_backoff, + ) + return fn(source_path, destination_path, queue, **kwargs) + + @classmethod + def _process_single_without_backoff( + cls, + source_path: str, + destination_path: str, + queue: QueueType, + **kwargs, ): """Lets extract from a single WARC file.""" From 3bde00fc03cdbc2271e24a866e36317483029c01 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 10 May 2024 20:13:22 +0000 Subject: [PATCH 012/169] added backoff --- python/dolma/warc/processor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/dolma/warc/processor.py b/python/dolma/warc/processor.py index 288837fc..385a1f77 100644 --- a/python/dolma/warc/processor.py +++ b/python/dolma/warc/processor.py @@ -84,8 +84,8 @@ def process_single( queue: QueueType, **kwargs, ): - max_time = kwargs.pop('backoff_max_time', None) or 10 ** 60 - max_tries = kwargs.pop('backoff_max_tries', None) or 10 
+ max_time = kwargs.pop("backoff_max_time", None) or 10**60 + max_tries = kwargs.pop("backoff_max_tries", None) or 10 fn = backoff.on_exception(backoff.expo, Exception, max_time=max_time, max_tries=max_tries)( cls._process_single_without_backoff, ) From 3903c7be93d66ac6338b35ee10a0abfc89acddbd Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 10 May 2024 20:30:27 +0000 Subject: [PATCH 013/169] logging --- python/dolma/warc/processor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/dolma/warc/processor.py b/python/dolma/warc/processor.py index 385a1f77..aaf76a76 100644 --- a/python/dolma/warc/processor.py +++ b/python/dolma/warc/processor.py @@ -1,4 +1,5 @@ import datetime +import logging import multiprocessing import tempfile from contextlib import ExitStack @@ -86,7 +87,8 @@ def process_single( ): max_time = kwargs.pop("backoff_max_time", None) or 10**60 max_tries = kwargs.pop("backoff_max_tries", None) or 10 - fn = backoff.on_exception(backoff.expo, Exception, max_time=max_time, max_tries=max_tries)( + (logger := cls.get_logger()).setLevel(logging.WARNING) + fn = backoff.on_exception(backoff.expo, Exception, max_time=max_time, max_tries=max_tries, logger=logger)( cls._process_single_without_backoff, ) return fn(source_path, destination_path, queue, **kwargs) From 875b8cbf8e6c4904b96ab218b0f8a109dd2b6300 Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Fri, 10 May 2024 20:32:42 +0000 Subject: [PATCH 014/169] unused --- python/dolma/core/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/dolma/core/registry.py b/python/dolma/core/registry.py index e0edf754..3f2fa856 100644 --- a/python/dolma/core/registry.py +++ b/python/dolma/core/registry.py @@ -61,7 +61,7 @@ def _add( tagger_cls._get_storage()[tagger_name] = (tagger_self, tagger_desc) return tagger_self - return _add # type: ignore + return _add @classmethod def remove(cls, name: str) -> bool: From 302e0a5e182a9caf0896d4c0d82f88e5e88c1174 
Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Sat, 11 May 2024 00:31:24 +0000 Subject: [PATCH 015/169] url fixes --- python/dolma/cli/warc.py | 5 +++-- python/dolma/core/parallel.py | 8 +++++++- python/dolma/core/paths.py | 7 ++++++- python/dolma/warc/processor.py | 9 ++++++--- python/dolma/warc/utils.py | 22 +++++++++++++--------- 5 files changed, 35 insertions(+), 16 deletions(-) diff --git a/python/dolma/cli/warc.py b/python/dolma/cli/warc.py index 50ca838e..fe6f8957 100644 --- a/python/dolma/cli/warc.py +++ b/python/dolma/cli/warc.py @@ -65,7 +65,7 @@ class WarcExtractorConfig: default=False, help="If true, only print the configuration and exit without running the pipieline.", ) - check: bool = field(default=True, help="If true, check if input documents are valid paths before running the") + skip_checks: bool = field(default=False, help="If true, skip checks on paths (e.g. validation, globbing). Useful in case many paths are being evaluated.") class WarcExtractorCli(BaseCli): @@ -84,7 +84,7 @@ def run(cls, parsed_config: WarcExtractorConfig): if not isinstance(source_name, str): raise ValueError(f"source_name must be a string, not {source_name} ({type(source_name)})") - if parsed_config.check: + if not parsed_config.skip_checks: # perform some path validation to make sure we don't call the warc # extractor with an invalid config total_matching_documents = 0 @@ -118,4 +118,5 @@ def run(cls, parsed_config: WarcExtractorConfig): skip_no_post_taggers=parsed_config.post.skip, store_html_in_metadata=parsed_config.store_html_in_metadata, linearizer_name=parsed_config.linearizer, + skip_source_glob=parsed_config.skip_checks, ) diff --git a/python/dolma/core/parallel.py b/python/dolma/core/parallel.py index 6013a378..48b2ffc3 100644 --- a/python/dolma/core/parallel.py +++ b/python/dolma/core/parallel.py @@ -70,6 +70,7 @@ def __init__( seed: int = 0, pbar_timeout: float = 1e-3, ignore_existing: bool = False, + skip_source_glob: bool = False, include_paths: 
Optional[List[str]] = None, exclude_paths: Optional[List[str]] = None, files_regex_pattern: Optional[str] = None, @@ -95,6 +96,7 @@ def __init__( seed (int, optional): The random seed to use when shuffling input files. Defaults to 0. pbar_timeout (float, optional): How often to update progress bars in seconds. Defaults to 0.01 seconds. + skip_source_glob (bool, optional): Do not glob source files. Off by default. ignore_existing (bool, optional): Whether to ignore files that have been already processed and re-run the processor on all files from scratch. Defaults to False. include_paths (Optional[List[str]], optional): A list of paths to include. If provided, only files @@ -169,6 +171,8 @@ def __init__( if len(self.src_prefixes) == 0: raise ValueError("At least one source prefix must be provided.") + self.skip_source_glob = skip_source_glob + if any("*" in p for p in itertools.chain(self.dst_prefixes, self.meta_prefixes)): raise ValueError("Destination and metadata prefixes cannot contain wildcards.") @@ -468,7 +472,9 @@ def _get_all_paths(self) -> AllPathsTuple: for src_prefix, dst_prefix, meta_prefix, kwargs_prefix in zip( self.src_prefixes, self.dst_prefixes, self.meta_prefixes, self.process_single_kwargs ): - current_source_prefixes = sorted(glob_path(src_prefix)) + current_source_prefixes = sorted( + [src_prefix] if self.skip_source_glob else glob_path(src_prefix) + ) if len(current_source_prefixes) > 1: # make relative only makes sense if there is more than one path; otherwise, it's unclear diff --git a/python/dolma/core/paths.py b/python/dolma/core/paths.py index ba597e13..f3f048cf 100644 --- a/python/dolma/core/paths.py +++ b/python/dolma/core/paths.py @@ -223,9 +223,14 @@ def glob_path( protocol, parsed_path = _pathify(path) fs = _get_fs(path) - if fs.isdir(path) and autoglob_dirs: + if autoglob_dirs and fs.isdir(path): path = join_path(protocol, _unescape_glob(parsed_path), "*") + if '*' not in str(path): + # nothing to glob + yield str(path) + return + 
for gl in fs.glob(path): gl = str(gl) diff --git a/python/dolma/warc/processor.py b/python/dolma/warc/processor.py index aaf76a76..10a8630d 100644 --- a/python/dolma/warc/processor.py +++ b/python/dolma/warc/processor.py @@ -176,7 +176,7 @@ def _process_single_without_backoff( if record.http_charset: try: decoded_content = content.decode(record.http_charset).strip() - except (UnicodeDecodeError, LookupError): + except (UnicodeDecodeError, LookupError, UnicodeError): decoded_content = "" if not decoded_content and (encoding := detect(content)["encoding"]): decoded_content = content.decode(str(encoding)).strip() @@ -266,6 +266,7 @@ def create_and_run_warc_pipeline( store_html_in_metadata: bool = False, skip_no_pre_taggers: bool = False, skip_no_post_taggers: bool = False, + skip_source_glob: bool = False, ): with ExitStack() as stack: if metadata is None: @@ -280,7 +281,8 @@ def create_and_run_warc_pipeline( if isinstance(destination, str) and isinstance(metadata, str): for src_pattern in [documents] if isinstance(documents, str) else documents: - all_src_paths.extend(list(glob_path(src_pattern))) + all_src_paths.extend([src_pattern] if skip_source_glob else list(glob_path(src_pattern))) + all_dst_paths.extend(_make_paths_from_prefix(paths=all_src_paths, prefix=destination)) all_meta_paths.extend(_make_paths_from_prefix(paths=all_src_paths, prefix=metadata)) @@ -295,7 +297,7 @@ def create_and_run_warc_pipeline( raise ValueError("metadata and destination must have the same length") for src_pattern, dst_pattern, meta_pattern in zip(documents, destination, metadata): - src_paths = list(glob_path(src_pattern)) + src_paths = [src_pattern] if skip_source_glob else list(glob_path(src_pattern)) all_src_paths.extend(src_paths) all_dst_paths.extend(_make_paths_from_prefix(paths=src_paths, prefix=dst_pattern)) all_meta_paths.extend(_make_paths_from_prefix(paths=src_paths, prefix=meta_pattern)) @@ -308,6 +310,7 @@ def create_and_run_warc_pipeline( metadata_prefix=all_meta_paths, 
debug=debug, seed=seed, + skip_source_glob=skip_source_glob, ignore_existing=ignore_existing, retries_on_error=retries_on_error, num_processes=num_processes, diff --git a/python/dolma/warc/utils.py b/python/dolma/warc/utils.py index 8c550cd1..424b646b 100644 --- a/python/dolma/warc/utils.py +++ b/python/dolma/warc/utils.py @@ -32,17 +32,21 @@ def __init__(self): self.www_subdomain_regex = re.compile(r"(^(www\d*\.))|(/+$)", re.IGNORECASE) def __call__(self, url: str) -> str: - # remove leading '<' or quotes and trailing '>', quotes, or slashes - clean_url = re.sub(r"(^['\"<]+)|([/'\">]+$)", "", url) + try: + # remove leading '<' or quotes and trailing '>', quotes, or slashes + clean_url = re.sub(r"(^['\"<]+)|([/'\">]+$)", "", url) - # canonicalize the URL - canonical = canonicalize_url(clean_url) - normalized = str(url_normalize(canonical)) + # canonicalize the URL + canonical = canonicalize_url(clean_url) - # remove the protocol - _, normalized = normalized.split("://", 1) + normalized = str(url_normalize(canonical)) - # remove the www subdomain - normalized = self.www_subdomain_regex.sub("", normalized) + # remove the protocol + _, normalized = normalized.split("://", 1) + + # remove the www subdomain + normalized = self.www_subdomain_regex.sub("", normalized) + except UnicodeError: + normalized = "" return normalized From 289d9733f398658447007df70bff8ab7189d30cf Mon Sep 17 00:00:00 2001 From: Luca Soldaini Date: Sat, 11 May 2024 22:02:46 +0000 Subject: [PATCH 016/169] speedup --- pyproject.toml | 5 +- python/dolma/cli/warc.py | 5 +- python/dolma/core/parallel.py | 4 +- python/dolma/core/paths.py | 2 +- python/dolma/core/utils.py | 18 +- python/dolma/taggers/science.py | 612 +++++++++++++++++++++---------- python/dolma/warc/linearizers.py | 39 +- python/dolma/warc/processor.py | 60 +-- 8 files changed, 499 insertions(+), 246 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 746cd885..675c6751 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,19 
+18,20 @@ dependencies = [ "omegaconf>=2.3.0", # "pycld2==0.41", # "pycld3==0.22", # does not install correctly - "acora>=2.4", + "hyperscan>=0.7.7", "platformdirs>=4.2.0", "pyyaml", "requests", "rich", "s3fs>=2023.6.0", - "smart-open", + "smart-open>=7.0.4", "tokenizers>=0.15.0,<1.0.0", "tqdm", "uniseg", "numpy", "necessary>=0.4.3", "charset-normalizer>=3.2.0", + "zstandard>=0.20.0", ] classifiers = [ "Development Status :: 5 - Production/Stable", diff --git a/python/dolma/cli/warc.py b/python/dolma/cli/warc.py index fe6f8957..e85d5384 100644 --- a/python/dolma/cli/warc.py +++ b/python/dolma/cli/warc.py @@ -65,7 +65,10 @@ class WarcExtractorConfig: default=False, help="If true, only print the configuration and exit without running the pipieline.", ) - skip_checks: bool = field(default=False, help="If true, skip checks on paths (e.g. validation, globbing). Useful in case many paths are being evaluated.") + skip_checks: bool = field( + default=False, + help="If true, skip checks on paths (e.g. validation, globbing). 
Useful in case many paths are being evaluated.", + ) class WarcExtractorCli(BaseCli): diff --git a/python/dolma/core/parallel.py b/python/dolma/core/parallel.py index 48b2ffc3..3d2449d5 100644 --- a/python/dolma/core/parallel.py +++ b/python/dolma/core/parallel.py @@ -472,9 +472,7 @@ def _get_all_paths(self) -> AllPathsTuple: for src_prefix, dst_prefix, meta_prefix, kwargs_prefix in zip( self.src_prefixes, self.dst_prefixes, self.meta_prefixes, self.process_single_kwargs ): - current_source_prefixes = sorted( - [src_prefix] if self.skip_source_glob else glob_path(src_prefix) - ) + current_source_prefixes = sorted([src_prefix] if self.skip_source_glob else glob_path(src_prefix)) if len(current_source_prefixes) > 1: # make relative only makes sense if there is more than one path; otherwise, it's unclear diff --git a/python/dolma/core/paths.py b/python/dolma/core/paths.py index f3f048cf..e0eda48d 100644 --- a/python/dolma/core/paths.py +++ b/python/dolma/core/paths.py @@ -226,7 +226,7 @@ def glob_path( if autoglob_dirs and fs.isdir(path): path = join_path(protocol, _unescape_glob(parsed_path), "*") - if '*' not in str(path): + if "*" not in str(path): # nothing to glob yield str(path) return diff --git a/python/dolma/core/utils.py b/python/dolma/core/utils.py index 2f5c5eb6..9b731109 100644 --- a/python/dolma/core/utils.py +++ b/python/dolma/core/utils.py @@ -3,17 +3,11 @@ import re import string import sys -from typing import List, Union, cast - -try: - import blingfire - - BLINGFIRE_AVAILABLE = True -except Exception: - BLINGFIRE_AVAILABLE = False +from typing import TYPE_CHECKING, List, Union, cast import nltk import uniseg.wordbreak +from necessary import necessary from nltk.tokenize.punkt import PunktSentenceTokenizer from omegaconf import OmegaConf as om @@ -22,10 +16,14 @@ except LookupError: nltk.download("punkt") - from .data_types import TextSlice from .loggers import get_logger +with necessary("blingfire", soft=True) as BLINGFIRE_AVAILABLE: + if 
BLINGFIRE_AVAILABLE or TYPE_CHECKING: + import blingfire + + sent_tokenizer = PunktSentenceTokenizer() logger = get_logger(__name__) @@ -134,7 +132,7 @@ def import_modules(modules_path: Union[List[str], None]): sys.path.insert(0, module_parent) importlib.import_module(module_name) elif module_path in sys.modules[module_name].__path__: - logger.info(f"{module_path} has already been imported.") + logger.info("%s has already been imported.", module_path) else: raise ImportError( f"Failed to import {module_path} because the corresponding module name " diff --git a/python/dolma/taggers/science.py b/python/dolma/taggers/science.py index c8dd075b..8ea9a5c6 100644 --- a/python/dolma/taggers/science.py +++ b/python/dolma/taggers/science.py @@ -1,6 +1,15 @@ -from typing import List, Optional +from typing import TYPE_CHECKING, Any, List, Optional -from acora import AcoraBuilder +from necessary import necessary + +with necessary("acora", soft=True) as ACORA_AVAILABLE: + if TYPE_CHECKING or ACORA_AVAILABLE: + from acora import AcoraBuilder + + +with necessary("hyperscan", soft=True) as HYPERSCAN_AVAILABLE: + if TYPE_CHECKING or HYPERSCAN_AVAILABLE: + from hyperscan import Database from ..core.data_types import DocResult, DocumentWithMetadata, Span from ..core.registry import TaggerRegistry @@ -8,49 +17,47 @@ class BaseHTMLKeywordLookupTagger(BaseTaggerWithMetadata): - KEYWORDS: List[str] + KEYWORDS: List[bytes] TYPE: str def __init__(self): + assert ACORA_AVAILABLE, "Acora is not available; please install with `pip install acora`." 
+ builder = AcoraBuilder() builder.update(self.KEYWORDS) self.acora = builder.build() - def _get_content(self, doc: DocumentWithMetadata) -> str: - html: Optional[str] = doc.metadata.get("html", None) + def _get_content(self, doc: DocumentWithMetadata) -> bytes: + html: Optional[bytes] = doc.metadata.get("html", None) if html is None: raise ValueError("Cannot find `html` key in metadata.") return html def predict(self, doc: DocumentWithMetadata) -> DocResult: # type: ignore content = self._get_content(doc) - spans = [ - Span( - start=(start := match[1]), - end=(end := match[1] + len(match[0])), - type=self.TYPE, - score=(end - start), - ) - for match in self.acora.finditer(content) - ] - return DocResult(doc=doc, spans=spans) + # check if there's a match; if yes, return immediately + for _ in self.acora.finditer(content): + return DocResult(doc=doc, spans=[Span(start=0, end=len(content), type=self.TYPE, score=1)]) + + # if no match, return empty spans + return DocResult(doc=doc, spans=[]) @TaggerRegistry.add("owm_math_v1") class OpenWebMathContainsMathTagger(BaseHTMLKeywordLookupTagger): TYPE = "math" KEYWORDS = [ - "MathJax", - "mathjax", - " bytes: + html: Optional[bytes] = doc.metadata.get("html", None) + if html is None: + raise ValueError("Cannot find `html` key in metadata.") + return html + + @staticmethod + def _on_match(id_: int, from_: int, to: int, flags: int, context: Optional[Any] = None) -> None: + if context is not None: + context.append((id_, from_, to, flags)) + + def predict(self, doc: DocumentWithMetadata) -> DocResult: # type: ignore + content = self._get_content(doc) + + context: List[tuple] = [] + self.db.scan(content, match_event_handler=self._on_match, context=context) + if context: + return DocResult(doc=doc, spans=[Span(start=0, end=len(content), type=self.TYPE, score=1)]) + + # if no match, return empty spans + return DocResult(doc=doc, spans=[]) + + +@TaggerRegistry.add("owm_math_v2") +class 
HyperscanOpenWebMathContainsMathTagger(HyperscanHTMLKeywordLookupTagger): + TYPE = "math" + KEYWORDS = [ + rb"MathJax", + rb"mathjax", + rb" str: + def linearize(self, content: bytes, encoding: Optional[str] = None) -> str: pass @@ -58,8 +65,8 @@ def __init__( self.comments = comments self.skip_elements = skip_elements - def linearize(self, content: Union[str, bytes]) -> str: - # html (HTMLTree or str) – HTML as DOM tree or Unicode string + def linearize(self, content: bytes, encoding: Optional[str] = None) -> str: + # html (HTMLTree or str) – HTML as DOM tree or Unicode string # preserve_formatting (bool) – preserve basic block-level formatting # main_content (bool) – apply simple heuristics for extracting only “main-content” elements # list_bullets (bool) – insert bullets / numbers for list items @@ -69,11 +76,7 @@ def linearize(self, content: Union[str, bytes]) -> str: # noscript (bool) – extract contents of