diff --git a/changelog/2.1.0.md b/changelog/2.1.0.md new file mode 100644 index 0000000..8f2eb27 --- /dev/null +++ b/changelog/2.1.0.md @@ -0,0 +1,45 @@ +# Version 2.1.0 - Information Systems +## Version notes + +The main purpose of this minor release is to generate a new reference build for a reproducibility paper submitted to [Information Systems](https://www.journals.elsevier.com/information-systems). + +Some major changes include: + +- **Judge mapping*: the judges name are not just parsed but cleaned and mapped using the official list of judges provided by the European Court of Human Rights. This makes the network analysis of judges votes more reliable. +- **Tables**: Tables in judgements were ignored by the parser. They are now attached to the judgement in dedicated field. +- **Deployment, CD/CI, code quality**: We focused on producing better code, fully end-to-end and with a fully automated deployment. The build is now automatically generated and updated every month. +- **Reproducilibity**: It is now possible to input a list of cases to be certain to reproduce perfectly a build. + +## Issues and enhancements + +- [#187](https://github.com/echr-od/ECHR-OD_process/issues/187) Move the server to Poznan University of Technology +- [#182](https://github.com/echr-od/ECHR-OD_process/issues/182) Could not normalize the tokens +- [#178](https://github.com/echr-od/ECHR-OD_process/issues/178) Proper Judge mapping +- [#163](https://github.com/echr-od/ECHR-OD_process/issues/163) Incorrect parsing of judge name data quality (fixed by 178) +- [#176](https://github.com/echr-od/ECHR-OD_process/issues/176) Case 001-177299 has wrongly parsed Judges data quality +- [#170](https://github.com/echr-od/ECHR-OD_process/issues/170) Incorrect logic in get_documens step bug +- [#161](https://github.com/echr-od/ECHR-OD_process/issues/161) Add Pawel to the contributor list +- [#158](https://github.com/echr-od/ECHR-OD_process/issues/158) Open a Matrix/Element chanel for discussions enhancement +- [#154](https://github.com/echr-od/ECHR-OD_process/issues/154) Cyclic release is broken because of shlex.quote bug +- [#153](https://github.com/echr-od/ECHR-OD_process/issues/153) IndexError when deploy step is called without parameters bug +- [#152](https://github.com/echr-od/ECHR-OD_process/issues/152) Parsing texts with special characters bug data quality +- [#151](https://github.com/echr-od/ECHR-OD_process/issues/151) Parsing of tables data quality enhancement +- [#149](https://github.com/echr-od/ECHR-OD_process/issues/149) One case has an invalid label .5 bug +- [#125](https://github.com/echr-od/ECHR-OD_process/issues/125) Conclusion parser assigns wrong label for Protocol bug data quality +- [#121](https://github.com/echr-od/ECHR-OD_process/issues/121) Automatically trigger ECHR-OD Explorer update at the end of runner workflow enhancement +- [#118](https://github.com/echr-od/ECHR-OD_process/issues/118) Build name should be passed to the deployment script enhancement +- [#116](https://github.com/echr-od/ECHR-OD_process/issues/116) Build info should use write more rather than append mode bug +- [#114](https://github.com/echr-od/ECHR-OD_process/issues/114) Add basic build manifest into the build folder +- [#111](https://github.com/echr-od/ECHR-OD_process/issues/111) Lower memory footprint in database preparation +- [#110](https://github.com/echr-od/ECHR-OD_process/issues/110) Improve code quality according to Codacy +- [#106](https://github.com/echr-od/ECHR-OD_process/issues/106) Lock file and build history for automated update of ECHR Explorer +- [#97](https://github.com/echr-od/ECHR-OD_process/issues/97) Update a build through a particular list of cases +- [#96](https://github.com/echr-od/ECHR-OD_process/issues/96) Separate the main database workflow from the dataset/language model creation +- [#93](https://github.com/echr-od/ECHR-OD_process/issues/93) Better organization of the build files enhancement +- [#78](https://github.com/echr-od/ECHR-OD_process/issues/78) Removing Selenium to get the total number of cases enhancement +- [#77](https://github.com/echr-od/ECHR-OD_process/issues/77) Update a given build with documents from a given range of time enhancement +- [#74](https://github.com/echr-od/ECHR-OD_process/issues/74) Fix linter error bug +- [#54](https://github.com/echr-od/ECHR-OD_process/issues/54) Deployment tool enhancement +- [#53](https://github.com/echr-od/ECHR-OD_process/issues/53) Continuous Integration enhancement +- [#52](https://github.com/echr-od/ECHR-OD_process/issues/52) Containerize the process enhancement + diff --git a/config.yml b/config.yml index 361d063..38b615f 100644 --- a/config.yml +++ b/config.yml @@ -5,14 +5,14 @@ logging: steps: normalize: ngrams: - 1: 5 - 2: 3 - 3: 2 - 4: 2 + 1: 1 + 2: 1 + 3: 1 + 4: 1 build: env: - LIMIT_TOKENS: 10000 + LIMIT_TOKENS: 5000 OSF_PARAMS: 'change me' SRV_PARAMS: 'change me' diff --git a/echr/steps/generate_datasets.py b/echr/steps/generate_datasets.py index 304bf18..76d6f1d 100644 --- a/echr/steps/generate_datasets.py +++ b/echr/steps/generate_datasets.py @@ -34,9 +34,9 @@ def generate_dataset(cases, keys, keys_list, encoded_outcomes, feature_index, fe classes = [] for e in c[conclusion_key]: if e['type'] in ['violation', 'no-violation']: - if 'article' in e and e['article'] in encoded_outcomes: - g = encoded_outcomes[e['article']] - if filter_classes is None or e['article'] in filter_classes: + if 'base_article' in e and e['base_article'] in encoded_outcomes: + g = encoded_outcomes[e['base_article']] + if filter_classes is None or e['base_article'] in filter_classes: classes.append('{}:{}'.format(g, 1 if e['type'] == 'violation' else 0)) classes = list(set(classes)) opposed_classes = any( @@ -46,11 +46,11 @@ def generate_dataset(cases, keys, keys_list, encoded_outcomes, feature_index, fe f.write(' '.join(classes) + '\n') for e in c[conclusion_key]: if e['type'] in ['violation', 'no-violation']: - if 'article' in e and e['article'] in encoded_outcomes: - if filter_classes is None or e['article'] in filter_classes: - if e['article'] not in outcome_distribution: - outcome_distribution[e['article']] = {'violation': 0, 'no-violation': 0} - outcome_distribution[e['article']][e['type']] += 1 + if 'base_article' in e and e['base_article'] in encoded_outcomes: + if filter_classes is None or e['base_article'] in filter_classes: + if e['base_article'] not in outcome_distribution: + outcome_distribution[e['base_article']] = {'violation': 0, 'no-violation': 0} + outcome_distribution[e['base_article']][e['type']] += 1 if name != 'multilabel': break for k, v in c.items():