Merge pull request #51 from Legilibre/sommaires

Cette branche normalise le contenu de la colonne `sommaires.num` et ajoute celle-ci à un index afin d'accélérer la résolution d'une référence à un élément d'un texte (#4). Pour les sections, la valeur stockée dans `sommaires.num` n'est que la « première moitié » de `sections.titre_ta`, c'est-à-dire le niveau et le numéro de la section, mais pas son objet. Concrètement, pour une section dont le titre complet est `Titre 1er : Dispositions générales`, seul `Titre 1er` est stocké dans `sommaires.num`.
Legilibre · Jan 12, 2019 · 00a7b20 · 00a7b20
2 parents 7dacb05 + ab29406
commit 00a7b20
Show file tree

Hide file tree

Showing 4 changed files with 69 additions and 5 deletions.
diff --git a/legi/normalize.py b/legi/normalize.py
@@ -17,8 +17,8 @@
 from .html import bad_space_re, drop_bad_space, split_first_paragraph
 from .roman import ROMAN_PATTERN as roman_num
 from .sections import (
-    legifrance_url_section, normalize_section_num, section_re, section_type_p,
-    sujet_re,
+    legifrance_url_section, normalize_section_num, reduce_section_title,
+    section_re, section_type_p, sujet_re,
 )
 from .titles import NATURE_MAP_R_SD, gen_titre, normalize_title, parse_titre
 from .utils import (
@@ -602,6 +602,47 @@ def clean_num_match(m):
                 log_file.write('%r => %r (%i×)\n' % (change[0], change[1], count))
 
 
+def normalize_sommaires_num(db, dry_run=False, log_file=None):
+    """Synchronize `sommaires.num` with the `articles` and `sections` tables.
+
+    Rows whose `element` id has `ARTI` at characters 5-8 receive
+    `articles.num`; rows with `SCTA` receive the result of
+    `reduce_section_title(sections.titre_ta)`. Only rows whose stored value
+    differs from the expected one are rewritten. The number of rows changed
+    by each of the two statements is printed as JSON at the end.
+
+    `dry_run` and `log_file` are accepted so that this function can be called
+    like the other normalizers, but they are not used in this function.
+    """
+    print("> Normalisation des numéros dans les sommaires...")
+
+    counts = {}
+
+    # Articles: COALESCE on both sides of the comparison makes NULL and ''
+    # compare as equal, so such rows are not rewritten needlessly.
+    db.run("""
+        UPDATE sommaires AS so
+           SET num = (
+                   SELECT a.num
+                     FROM articles a
+                    WHERE a.id = so.element
+               )
+         WHERE substr(so.element, 5, 4) = 'ARTI'
+           AND COALESCE(so.num, '') <> (
+                   SELECT COALESCE(a.num, '')
+                     FROM articles a
+                    WHERE a.id = so.element
+               )
+    """)
+    counts['updated num for article'] = db.changes()
+
+    # Sections: reduce_section_title() returns NULL for titles that have no
+    # "first half", and `x <> NULL` is never true in SQL. The right-hand side
+    # therefore needs the same COALESCE as the article query, otherwise a
+    # stale non-empty `num` on such a row would never be cleared.
+    db.create_function('reduce_section_title', 1, reduce_section_title)
+    db.run("""
+        UPDATE sommaires AS so
+           SET num = (
+                   SELECT reduce_section_title(s.titre_ta)
+                     FROM sections s
+                    WHERE s.id = so.element
+               )
+         WHERE substr(so.element, 5, 4) = 'SCTA'
+           AND COALESCE(so.num, '') <> (
+                   SELECT COALESCE(reduce_section_title(s.titre_ta), '')
+                     FROM sections s
+                    WHERE s.id = so.element
+               )
+    """)
+    counts['updated num for section'] = db.changes()
+
+    print("Done. Result: " + json.dumps(counts, indent=4, sort_keys=True))
+
+
 def normalize_text_titles(db, dry_run=False, log_file=None):
     print("> Normalisation des titres des textes...")
 
@@ -821,7 +862,7 @@ def get_key(key, ignore_not_found=False):
     p = ArgumentParser()
     p.add_argument('db')
     p.add_argument('what', nargs='?', default='all', choices=[
-        'all', 'articles_num', 'sections_titres', 'textes_titres'
+        'all', 'articles_num', 'sections_titres', 'sommaires_num', 'textes_titres'
     ])
     p.add_argument('--dry-run', action='store_true', default=False)
     p.add_argument('--log-path', default='/dev/null')
@@ -837,6 +878,8 @@ def get_key(key, ignore_not_found=False):
                 normalize_section_titles(db, dry_run=args.dry_run, log_file=log_file)
             if args.what in ('all', 'articles_num'):
                 normalize_article_numbers(db, dry_run=args.dry_run, log_file=log_file)
+            if args.what in ('all', 'sommaires_num'):
+                normalize_sommaires_num(db, dry_run=args.dry_run, log_file=log_file)
             if args.dry_run:
                 raise KeyboardInterrupt
     except KeyboardInterrupt:

diff --git a/legi/sections.py b/legi/sections.py
@@ -95,3 +95,20 @@ def normalize_section_num(num):
     if num_sd in SPECIAL_NUMS_MAP:
         num = SPECIAL_NUMS_MAP[num_sd]
     return num
+
+
+def reduce_section_title(titre_ta):
+    """Reduce a section title to its "first half".
+
+    Returns the part of `titre_ta` matched by `section_re`, stripped of any
+    trailing `.` or `°`, provided that the match either covers the whole
+    title or is immediately followed by text matching `sujet_re`. Returns
+    `None` otherwise, and also when `titre_ta` is `None`.
+
+    >>> reduce_section_title("Titre 1er: Dispositions générales")
+    'Titre 1er'
+    >>> reduce_section_title("Première partie")
+    'Première partie'
+    >>> print(reduce_section_title("Dispositions finales"))
+    None
+    >>> print(reduce_section_title(None))
+    None
+
+    This function assumes that the section title has already been normalized.
+    """
+    # This function is also registered as an SQL function (see
+    # `normalize_sommaires_num`), where a NULL column value arrives as None;
+    # `section_re.match(None)` would raise TypeError.
+    if titre_ta is None:
+        return None
+    m = section_re.match(titre_ta)
+    if m and (m.end() == len(titre_ta) or sujet_re.match(titre_ta, m.end())):
+        return m.group(0).rstrip('.°')
+    return None
diff --git a/legi/sql/migrations.sql b/legi/sql/migrations.sql
@@ -46,3 +46,7 @@ CREATE VIEW textes_versions_brutes_view AS
       FROM textes_versions a
  LEFT JOIN textes_versions_brutes b
         ON b.id = a.id AND b.cid = a.cid AND b.dossier = a.dossier AND b.mtime = a.mtime;
+
+-- migration #5
+-- Replace the index on `sommaires (cid)` with a composite index on
+-- `(cid, num)`, so that lookups by `num` within a given `cid` can use it.
+DROP INDEX sommaires_cid_idx;
+CREATE INDEX sommaires_cid_num_idx ON sommaires (cid, num);
diff --git a/legi/sql/schema.sql b/legi/sql/schema.sql
@@ -4,7 +4,7 @@ CREATE TABLE db_meta
 , value   blob
 );
 
-INSERT INTO db_meta (key, value) VALUES ('schema_version', 3);
+INSERT INTO db_meta (key, value) VALUES ('schema_version', 5);
 
 CREATE TABLE textes
 ( id            integer    primary key not null
@@ -95,7 +95,7 @@ CREATE TABLE sommaires
 , _source    text       -- to support incremental updates
 );
 
-CREATE INDEX sommaires_cid_idx ON sommaires (cid);
+CREATE INDEX sommaires_cid_num_idx ON sommaires (cid, num);  -- same name as the index created by migration #5
 
 CREATE TABLE liens
 ( src_id      char(20)   not null