Merge remote-tracking branch 'origin/main'

dhdaines · Jul 25, 2024 · 8cbaf4f · 8cbaf4f
2 parents 52d9775 + a91016b
commit 8cbaf4f
Show file tree

Hide file tree

Showing 10 changed files with 2,308 additions and 11 deletions.
diff --git a/.github/workflows/analyse.yml b/.github/workflows/analyse.yml
@@ -55,16 +55,20 @@ jobs:
         alexi -v download -u https://vsadm.ca/citoyens/reglementation/reglementation-durbanisme/ -o download/vsadm --all-pdf-links
         alexi -v download -u https://www.vss.ca/services-aux-citoyens/services/reglementation-durbanisme/ \
               -o download/vss --all-pdf-links -x '[Aa]nnexe'
+        alexi -v download -u https://www.ville.prevost.qc.ca/guichet-citoyen/services/urbanisme/ \
+              -o download/prevost --all-pdf-links -x Annexe -x Formulaires -x PUMD -x PMAD
     - name: Extract
       run: |
         alexi -v extract -m download/index.json download/*.pdf
         alexi -v extract -m download/vsadm/index.json -o export/vsadm download/vsadm/*.pdf
         alexi -v extract -m download/vss/index.json -o export/vss download/vss/*.pdf
+        alexi -v extract -m download/prevost/index.json -o export/prevost download/prevost/*.pdf
     - name: Index
       run: |
         alexi -v index -o export/_idx export
         alexi -v index -o export/vsadm/_idx export/vsadm
         alexi -v index -o export/vss/_idx export/vss
+        alexi -v index -o export/prevost/_idx export/prevost
     - name: Setup Pages
       uses: actions/configure-pages@v5
     - name: Upload artifact

diff --git a/alexi/analyse.py b/alexi/analyse.py
@@ -237,7 +237,7 @@ class Document:
     """Document avec blocs de texte et structure."""
 
     fileid: str
-    meta: dict[str, str]
+    meta: dict[str, list[str]]
     paliers: dict[str, list[Element]]
     contenu: list[Bloc]
     unknown_id: int = 0
@@ -417,11 +417,10 @@ def __init__(self, fileid: str, words: Iterable[T_obj]):
         self.fileid = fileid
         self.words: list[T_obj] = list(words)
         self.blocs: list[Bloc] = list(group_iob(self.words, "segment"))
-        self.metadata: dict[str, str] = {}
+        self.metadata: dict[str, list[str]] = {}
         for bloc in group_iob(self.words, "sequence"):
-            if bloc.type not in self.metadata:
-                LOGGER.info(f"{bloc.type}: {bloc.texte}")
-                self.metadata[bloc.type] = bloc.texte
+            LOGGER.info(f"sequence {bloc.type}: {bloc.texte}")
+            self.metadata.setdefault(bloc.type, []).append(bloc.texte)
 
     def add_images(self, images: Iterable[Bloc], merge: bool = True):
         """Insérer les images en les fusionnant avec le texte (et entre elles)
@@ -459,8 +458,8 @@ def __call__(
         blocs: Optional[Iterable[Bloc]] = None,
     ) -> Document:
         """Analyse du structure d'un document."""
-        titre = self.metadata.get("Titre", "Document")
-        numero = self.metadata.get("Numero", "")
+        titre = self.metadata.get("Titre", ["Document"])[0]
+        numero = self.metadata.get("Numero", [""])[0]
         if m := re.search(r"(?i:num[ée]ro)\s+([0-9][A-Z0-9-]+)", titre):
             LOGGER.info("Numéro extrait du titre: %s", m.group(1))
             numero = m.group(1)

diff --git a/alexi/convert.py b/alexi/convert.py
@@ -82,6 +82,10 @@ def get_rgb(c: T_obj) -> str:
         r = g = b = couleur[0]
     elif len(couleur) == 3:
         r, g, b = couleur
+    elif len(couleur) == 4:
+        return "CMYK#" + "".join(
+            ("%x" % int(min(0.999, val) * 16) for val in (couleur))
+        )
     else:
         LOGGER.warning("Espace couleur non pris en charge: %s", couleur)
         return "#000"

diff --git a/alexi/extract.py b/alexi/extract.py
@@ -12,6 +12,8 @@
 from pathlib import Path
 from typing import Any, Iterable, Optional, TextIO
 
+from natsort import natsorted
+
 from alexi.analyse import Analyseur, Bloc, Document, Element, extract_zonage
 from alexi.convert import Converteur
 from alexi.format import HtmlFormatter
@@ -301,17 +303,24 @@ def make_doc_tree(docs: list[Document], outdir: Path) -> dict[str, dict[str, str
 </html>
 """
     metadata = {}
-    docs.sort(key=operator.attrgetter("numero"))
+
+    def doc_sort_key(doc):  # Builds the string used as the natsorted() key and as each document's <summary> label
+        if doc.numero != "":  # Document has a number: "<numero>: <titre>"
+            return f"{doc.numero}: {doc.titre}"
+        elif doc.fileid.startswith("RUD_T"):  # FIXME: Very special case here: unnumbered "RUD_T*" files get a hard-coded "843" prefix
+            return f"843: Règlement d’urbanisme durable: {doc.titre}"
+        else:  # No number and not a "RUD_T*" file: the title alone
+            return doc.titre
+
+    docs = natsorted(docs, key=doc_sort_key)
     with open(outdir / "index.html", "wt") as outfh:
         LOGGER.info("Génération de %s", outdir / "index.html")
         outfh.write(HTML_HEADER)
         for doc in docs:
             outfh.write('<li class="Document node"><details>\n')
             # Make fragment links to this ID expand the document (as
             # we usually do not want to link to the full text)
-            outfh.write(
-                f'<summary id="{doc.fileid}">{doc.numero}: {doc.titre}</summary>\n'
-            )
+            outfh.write(f'<summary id="{doc.fileid}">{doc_sort_key(doc)}</summary>\n')
             make_doc_subtree(doc, outfh)
             outfh.write("</details></li>\n")
             doc_metadata = {
@@ -350,6 +359,9 @@ def __init__(
         if metadata:
             with open(metadata, "rt") as infh:
                 self.pdfdata = json.load(infh)
+                for key in list(self.pdfdata.keys()):
+                    if "%20" in key:
+                        self.pdfdata[key.replace("%20", " ")] = self.pdfdata[key]
         else:
             self.pdfdata = {}
         self.metadata = {"pdfs": self.pdfdata}