Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/main'
Browse files Browse the repository at this point in the history
  • Loading branch information
dhdaines committed Jul 25, 2024
2 parents 52d9775 + a91016b commit 8cbaf4f
Show file tree
Hide file tree
Showing 10 changed files with 2,308 additions and 11 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/analyse.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,20 @@ jobs:
alexi -v download -u https://vsadm.ca/citoyens/reglementation/reglementation-durbanisme/ -o download/vsadm --all-pdf-links
alexi -v download -u https://www.vss.ca/services-aux-citoyens/services/reglementation-durbanisme/ \
-o download/vss --all-pdf-links -x '[Aa]nnexe'
alexi -v download -u https://www.ville.prevost.qc.ca/guichet-citoyen/services/urbanisme/ \
-o download/prevost --all-pdf-links -x Annexe -x Formulaires -x PUMD -x PMAD
- name: Extract
run: |
alexi -v extract -m download/index.json download/*.pdf
alexi -v extract -m download/vsadm/index.json -o export/vsadm download/vsadm/*.pdf
alexi -v extract -m download/vss/index.json -o export/vss download/vss/*.pdf
alexi -v extract -m download/prevost/index.json -o export/prevost download/prevost/*.pdf
- name: Index
run: |
alexi -v index -o export/_idx export
alexi -v index -o export/vsadm/_idx export/vsadm
alexi -v index -o export/vss/_idx export/vss
alexi -v index -o export/prevost/_idx export/prevost
- name: Setup Pages
uses: actions/configure-pages@v5
- name: Upload artifact
Expand Down
13 changes: 6 additions & 7 deletions alexi/analyse.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ class Document:
"""Document avec blocs de texte et structure."""

fileid: str
meta: dict[str, str]
meta: dict[str, list[str]]
paliers: dict[str, list[Element]]
contenu: list[Bloc]
unknown_id: int = 0
Expand Down Expand Up @@ -417,11 +417,10 @@ def __init__(self, fileid: str, words: Iterable[T_obj]):
self.fileid = fileid
self.words: list[T_obj] = list(words)
self.blocs: list[Bloc] = list(group_iob(self.words, "segment"))
self.metadata: dict[str, str] = {}
self.metadata: dict[str, list[str]] = {}
for bloc in group_iob(self.words, "sequence"):
if bloc.type not in self.metadata:
LOGGER.info(f"{bloc.type}: {bloc.texte}")
self.metadata[bloc.type] = bloc.texte
LOGGER.info(f"sequence {bloc.type}: {bloc.texte}")
self.metadata.setdefault(bloc.type, []).append(bloc.texte)

def add_images(self, images: Iterable[Bloc], merge: bool = True):
"""Insérer les images en les fusionnant avec le texte (et entre elles)
Expand Down Expand Up @@ -459,8 +458,8 @@ def __call__(
blocs: Optional[Iterable[Bloc]] = None,
) -> Document:
"""Analyse du structure d'un document."""
titre = self.metadata.get("Titre", "Document")
numero = self.metadata.get("Numero", "")
titre = self.metadata.get("Titre", ["Document"])[0]
numero = self.metadata.get("Numero", [""])[0]
if m := re.search(r"(?i:num[ée]ro)\s+([0-9][A-Z0-9-]+)", titre):
LOGGER.info("Numéro extrait du titre: %s", m.group(1))
numero = m.group(1)
Expand Down
4 changes: 4 additions & 0 deletions alexi/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,10 @@ def get_rgb(c: T_obj) -> str:
r = g = b = couleur[0]
elif len(couleur) == 3:
r, g, b = couleur
elif len(couleur) == 4:
return "CMYK#" + "".join(
("%x" % int(min(0.999, val) * 16) for val in (couleur))
)
else:
LOGGER.warning("Espace couleur non pris en charge: %s", couleur)
return "#000"
Expand Down
20 changes: 16 additions & 4 deletions alexi/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
from pathlib import Path
from typing import Any, Iterable, Optional, TextIO

from natsort import natsorted

from alexi.analyse import Analyseur, Bloc, Document, Element, extract_zonage
from alexi.convert import Converteur
from alexi.format import HtmlFormatter
Expand Down Expand Up @@ -301,17 +303,24 @@ def make_doc_tree(docs: list[Document], outdir: Path) -> dict[str, dict[str, str
</html>
"""
metadata = {}
docs.sort(key=operator.attrgetter("numero"))

def doc_sort_key(doc):
    """Return the label used both to order a document and to title it.

    The result depends on which identifying attributes the document has:

    * a non-empty ``numero`` gives ``"<numero>: <titre>"``;
    * an empty ``numero`` with a ``fileid`` starting with ``"RUD_T"`` gives
      the title prefixed with a fixed ``843`` heading;
    * anything else gives the bare ``titre``.
    """
    if doc.numero == "":
        # FIXME: very special case -- documents whose file id starts with
        # "RUD_T" have no number of their own and get a hard-coded prefix.
        if doc.fileid.startswith("RUD_T"):
            return f"843: Règlement d’urbanisme durable: {doc.titre}"
        return doc.titre
    return f"{doc.numero}: {doc.titre}"

docs = natsorted(docs, key=doc_sort_key)
with open(outdir / "index.html", "wt") as outfh:
LOGGER.info("Génération de %s", outdir / "index.html")
outfh.write(HTML_HEADER)
for doc in docs:
outfh.write('<li class="Document node"><details>\n')
# Make fragment links to this ID expand the document (as
# we usually do not want to link to the full text)
outfh.write(
f'<summary id="{doc.fileid}">{doc.numero}: {doc.titre}</summary>\n'
)
outfh.write(f'<summary id="{doc.fileid}">{doc_sort_key(doc)}</summary>\n')
make_doc_subtree(doc, outfh)
outfh.write("</details></li>\n")
doc_metadata = {
Expand Down Expand Up @@ -350,6 +359,9 @@ def __init__(
if metadata:
with open(metadata, "rt") as infh:
self.pdfdata = json.load(infh)
for key in list(self.pdfdata.keys()):
if "%20" in key:
self.pdfdata[key.replace("%20", " ")] = self.pdfdata[key]
else:
self.pdfdata = {}
self.metadata = {"pdfs": self.pdfdata}
Expand Down
Loading

0 comments on commit 8cbaf4f

Please sign in to comment.