Skip to content

Commit

Permalink
Merge pull request #51 from Legilibre/sommaires
Browse files Browse the repository at this point in the history
Cette branche normalise le contenu de la colonne `sommaires.num` et ajoute celle-ci à un index afin d’accélérer la résolution d'une référence à un élément d'un texte (#4).

Pour les sections la valeur stockée dans `sommaires.num` n'est que la "première moitié" de `sections.titre_ta`, c'est-à-dire le niveau et le numéro de la section, mais pas son objet. Concrètement pour une section dont le titre complet est `Titre 1er : Dispositions générales`, seul `Titre 1er` est stocké dans `sommaires.num`.
  • Loading branch information
Changaco authored Jan 12, 2019
2 parents 7dacb05 + ab29406 commit 00a7b20
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 5 deletions.
49 changes: 46 additions & 3 deletions legi/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
from .html import bad_space_re, drop_bad_space, split_first_paragraph
from .roman import ROMAN_PATTERN as roman_num
from .sections import (
legifrance_url_section, normalize_section_num, section_re, section_type_p,
sujet_re,
legifrance_url_section, normalize_section_num, reduce_section_title,
section_re, section_type_p, sujet_re,
)
from .titles import NATURE_MAP_R_SD, gen_titre, normalize_title, parse_titre
from .utils import (
Expand Down Expand Up @@ -602,6 +602,47 @@ def clean_num_match(m):
log_file.write('%r => %r (%i×)\n' % (change[0], change[1], count))


def normalize_sommaires_num(db, dry_run=False, log_file=None):
    """Synchronize `sommaires.num` with the articles and sections it points to.

    Two UPDATE statements are executed:

    1. Rows whose `element` identifies an article (characters 5-8 of the id
       are 'ARTI') get `num` copied from `articles.num`.
    2. Rows whose `element` identifies a section ('SCTA') get `num` set to
       `reduce_section_title(sections.titre_ta)`, i.e. only the level and
       number of the section, without its subject.

    Only rows whose current value differs from the target value are touched,
    so the reported counts reflect actual changes.

    Args:
        db: database handle exposing `run()`, `changes()` and
            `create_function()`.
        dry_run: not used by this function; accepted so that it can be called
            like the other `normalize_*` functions.
        log_file: not used by this function; accepted for the same reason.
    """
    print("> Normalisation des numéros dans les sommaires...")

    counts = {}

    # Both sides of the comparison are coalesced to '' because in SQL
    # `x <> NULL` evaluates to NULL (never true), which would hide rows where
    # exactly one of the two values is NULL.
    db.run("""
        UPDATE sommaires AS so
           SET num = (
                   SELECT a.num
                     FROM articles a
                    WHERE a.id = so.element
               )
         WHERE substr(so.element, 5, 4) = 'ARTI'
           AND COALESCE(so.num, '') <> (
                   SELECT COALESCE(a.num, '')
                     FROM articles a
                    WHERE a.id = so.element
               )
    """)
    counts['updated num for article'] = db.changes()

    # Expose the Python helper to SQL so the reduction runs inside the UPDATE.
    db.create_function('reduce_section_title', 1, reduce_section_title)
    # reduce_section_title() returns NULL for titles without a "first half"
    # (e.g. "Dispositions finales"). Its result must be coalesced too,
    # otherwise a stale non-empty `num` would never be reset for such
    # sections.
    db.run("""
        UPDATE sommaires AS so
           SET num = (
                   SELECT reduce_section_title(s.titre_ta)
                     FROM sections s
                    WHERE s.id = so.element
               )
         WHERE substr(so.element, 5, 4) = 'SCTA'
           AND COALESCE(so.num, '') <> (
                   SELECT COALESCE(reduce_section_title(s.titre_ta), '')
                     FROM sections s
                    WHERE s.id = so.element
               )
    """)
    counts['updated num for section'] = db.changes()

    print("Done. Result: " + json.dumps(counts, indent=4, sort_keys=True))


def normalize_text_titles(db, dry_run=False, log_file=None):
print("> Normalisation des titres des textes...")

Expand Down Expand Up @@ -821,7 +862,7 @@ def get_key(key, ignore_not_found=False):
p = ArgumentParser()
p.add_argument('db')
p.add_argument('what', nargs='?', default='all', choices=[
'all', 'articles_num', 'sections_titres', 'textes_titres'
'all', 'articles_num', 'sections_titres', 'sommaires_num', 'textes_titres'
])
p.add_argument('--dry-run', action='store_true', default=False)
p.add_argument('--log-path', default='/dev/null')
Expand All @@ -837,6 +878,8 @@ def get_key(key, ignore_not_found=False):
normalize_section_titles(db, dry_run=args.dry_run, log_file=log_file)
if args.what in ('all', 'articles_num'):
normalize_article_numbers(db, dry_run=args.dry_run, log_file=log_file)
if args.what in ('all', 'sommaires_num'):
normalize_sommaires_num(db, dry_run=args.dry_run, log_file=log_file)
if args.dry_run:
raise KeyboardInterrupt
except KeyboardInterrupt:
Expand Down
17 changes: 17 additions & 0 deletions legi/sections.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,20 @@ def normalize_section_num(num):
if num_sd in SPECIAL_NUMS_MAP:
num = SPECIAL_NUMS_MAP[num_sd]
return num


def reduce_section_title(titre_ta):
    """Reduce a section title to its "first half".

    The "first half" is the part matched by `section_re` at the start of the
    title, with any trailing '.' or '°' characters stripped. It is returned
    only when the match covers the whole title or is immediately followed by
    something `sujet_re` accepts; otherwise the result is `None`.

    >>> reduce_section_title("Titre 1er: Dispositions générales")
    'Titre 1er'
    >>> reduce_section_title("Première partie")
    'Première partie'
    >>> print(reduce_section_title("Dispositions finales"))
    None
    >>> print(reduce_section_title(None))
    None

    This function assumes that the section title has already been normalized.
    """
    # This function is also registered as an SQL function, where a NULL
    # column value arrives as None; bail out instead of letting the regex
    # raise a TypeError.
    if not titre_ta:
        return None
    m = section_re.match(titre_ta)
    if m is None:
        return None
    end = m.end()
    if end == len(titre_ta) or sujet_re.match(titre_ta, end):
        return m.group(0).rstrip('.°')
    return None
4 changes: 4 additions & 0 deletions legi/sql/migrations.sql
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,7 @@ CREATE VIEW textes_versions_brutes_view AS
FROM textes_versions a
LEFT JOIN textes_versions_brutes b
ON b.id = a.id AND b.cid = a.cid AND b.dossier = a.dossier AND b.mtime = a.mtime;

-- migration #5
-- Replace the single-column index on `cid` with a composite (cid, num) index.
DROP INDEX sommaires_cid_idx;
CREATE INDEX sommaires_cid_num_idx ON sommaires (cid, num);
4 changes: 2 additions & 2 deletions legi/sql/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ CREATE TABLE db_meta
, value blob
);

INSERT INTO db_meta (key, value) VALUES ('schema_version', 3);
INSERT INTO db_meta (key, value) VALUES ('schema_version', 5);

CREATE TABLE textes
( id integer primary key not null
Expand Down Expand Up @@ -95,7 +95,7 @@ CREATE TABLE sommaires
, _source text -- to support incremental updates
);

CREATE INDEX sommaires_cid_idx ON sommaires (cid);
-- Named after its columns, like the index created by migration #5, so that
-- freshly created and migrated databases expose the same index name.
CREATE INDEX sommaires_cid_num_idx ON sommaires (cid, num);

CREATE TABLE liens
( src_id char(20) not null
Expand Down

0 comments on commit 00a7b20

Please sign in to comment.