From a7e4b9e4ae952538557c20c68ca1c52e79a922b8 Mon Sep 17 00:00:00 2001 From: Pepe Marquez Date: Tue, 5 Nov 2024 18:55:51 +0100 Subject: [PATCH 01/27] First version of edit extraction schema --- pyproject.toml | 3 +- .../__init__.py | 15 + .../llm_extraction_schema.py | 479 ++++++++++++++++++ 3 files changed, 496 insertions(+), 1 deletion(-) create mode 100644 src/perovskite_solar_cell_database/llm_extraction_schema.py diff --git a/pyproject.toml b/pyproject.toml index e08493f..0dbd73f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -134,4 +134,5 @@ where = ["src"] perovskite_solar_cell = "perovskite_solar_cell_database:perovskite_solar_cell" solar_cell_app = "perovskite_solar_cell_database.apps:solar_cells" perovskite_composition = "perovskite_solar_cell_database:perovskite_composition" -ion_parser = "perovskite_solar_cell_database:ion_parser" \ No newline at end of file +ion_parser = "perovskite_solar_cell_database:ion_parser" +llm_extraction_schema = "perovskite_solar_cell_database:llm_extraction_schema" \ No newline at end of file diff --git a/src/perovskite_solar_cell_database/__init__.py b/src/perovskite_solar_cell_database/__init__.py index f8e759f..182f035 100644 --- a/src/perovskite_solar_cell_database/__init__.py +++ b/src/perovskite_solar_cell_database/__init__.py @@ -66,3 +66,18 @@ def load(self): }, }, ) + + +class LLMSchemaExtractionPackageEntryPoint(SchemaPackageEntryPoint): + def load(self): + from perovskite_solar_cell_database.llm_extraction_schema import ( + m_package, + ) + + return m_package + + +llm_extraction_schema = LLMSchemaExtractionPackageEntryPoint( + name='LLMExtractionSchema', + description='Schema package defined for the perovskite solar cells database LLM extraction.', +) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py new file mode 100644 index 0000000..a567f52 --- /dev/null +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ 
-0,0 +1,479 @@ +from typing import ( + TYPE_CHECKING, +) + +from nomad.datamodel.data import ArchiveSection +from nomad.datamodel.metainfo.basesections import PublicationReference +from nomad.datamodel.metainfo.eln import ELNAnnotation +from nomad.metainfo import JSON, Quantity, Section, SubSection +from nomad.metainfo.metainfo import MEnum + +if TYPE_CHECKING: + pass + +from nomad.datamodel.data import Schema +from nomad.metainfo import SchemaPackage + +m_package = SchemaPackage() + + +# LightSource class +class LightSource(ArchiveSection): + m_def = Section(label='Light Source') + + type = Quantity( + type=MEnum( + [ + 'AM 1.5G', + 'AM 1.5D', + 'AM 0', + 'Monochromatic', + 'White LED', + 'Other', + 'Outdoor', + ] + ), + description='Type of light source', + a_eln=ELNAnnotation(label='Light Source Type', component='EnumEditQuantity'), + ) + + description = Quantity( + type=str, + description='Additional details about the light source. This is very important.', + a_eln=ELNAnnotation(label='Description', component='StringEditQuantity'), + ) + + light_intensity = Quantity( + type=float, + unit='mW/cm**2', + description='Light intensity value', + a_eln=ELNAnnotation( + label='Light Intensity', + component='NumberEditQuantity', + defaultDisplayUnit='mW/cm**2', + props=dict(minValue=0), + ), + ) + + lamp = Quantity( + type=str, + description='Type of lamp used to generate the spectrum', + a_eln=ELNAnnotation(label='Lamp', component='StringEditQuantity'), + ) + + +# Stability class +class Stability(ArchiveSection): + time = Quantity( + type=float, + unit='hour', + description='Duration of the stability test', + a_eln=ELNAnnotation( + label='Time', defaultDisplayUnit='hour', component='NumberEditQuantity' + ), + ) + + light_intensity = Quantity( + type=float, + unit='mW/cm**2', + description='Light intensity during stability test', + a_eln=ELNAnnotation( + label='Light Intensity', + component='NumberEditQuantity', + defaultDisplayUnit='mW/cm**2', + props=dict(minValue=0), 
+ ), + ) + + humidity = Quantity( + type=float, + description='Relative humidity during stability test', + a_eln=ELNAnnotation( + label='Humidity', + component='NumberEditQuantity', + props=dict(minValue=0, maxValue=100), + ), + ) + + temperature = Quantity( + type=float, + unit='°C', + description='Temperature during stability test', + a_eln=ELNAnnotation( + label='Temperature', defaultDisplayUnit='°C', component='NumberEditQuantity' + ), + ) + + PCE_T80 = Quantity( + type=float, + unit='hour', + description='Time after which the cell performance has degraded by 20%', + a_eln=ELNAnnotation( + label='PCE T80', defaultDisplayUnit='hour', component='NumberEditQuantity' + ), + ) + + PCE_at_start = Quantity( + type=float, + description='PCE at the start of the experiment', + a_eln=ELNAnnotation(label='PCE at Start', component='NumberEditQuantity'), + ) + + PCE_after_1000_hours = Quantity( + type=float, + description='PCE after 1000 hours', + a_eln=ELNAnnotation( + label='PCE after 1000 Hours', component='NumberEditQuantity' + ), + ) + + PCE_at_end = Quantity( + type=float, + description='PCE at the end of the experiment', + a_eln=ELNAnnotation(label='PCE at End', component='NumberEditQuantity'), + ) + + +# ProcessingAtmosphere class +class ProcessingAtmosphere(ArchiveSection): + m_def = Section(label='Processing Atmosphere') + + type = Quantity( + type=str, + description='Type of atmosphere', + a_eln=ELNAnnotation(label='Atmosphere Type', component='StringEditQuantity'), + ) + + pressure = Quantity( + type=float, + unit='mbar', + description='Pressure during processing', + a_eln=ELNAnnotation( + label='Pressure', defaultDisplayUnit='mbar', component='NumberEditQuantity' + ), + ) + + relative_humidity = Quantity( + type=float, + description='Relative humidity during processing', + a_eln=ELNAnnotation( + label='Relative Humidity', + component='NumberEditQuantity', + props=dict(minValue=0, maxValue=100), + ), + ) + + +# ReactionSolution class +class 
ReactionSolution(ArchiveSection): + m_def = Section(label='Reaction Solution') + + compounds = Quantity( + type=str, + shape=['*'], + description='List of compounds', + a_eln=ELNAnnotation(label='Compounds', component='StringEditQuantity'), + ) + + concentrations = Quantity( + type=float, + shape=['*'], + description='Concentrations of compounds', + a_eln=ELNAnnotation(label='Concentrations', component='NumberEditQuantity'), + ) + + concentrations_unit = Quantity( + type=str, + description='Unit of the concentrations', + a_eln=ELNAnnotation( + label='Concentrations Unit', component='StringEditQuantity' + ), + ) + + volume = Quantity( + type=float, + unit='L', + description='Volume of the solution', + a_eln=ELNAnnotation( + label='Volume', defaultDisplayUnit='L', component='NumberEditQuantity' + ), + ) + + temperature = Quantity( + type=float, + unit='°C', + description='Temperature of the solution', + a_eln=ELNAnnotation( + label='Temperature', defaultDisplayUnit='°C', component='NumberEditQuantity' + ), + ) + + solvent = Quantity( + type=str, + description='Solvent used', + a_eln=ELNAnnotation(label='Solvent', component='StringEditQuantity'), + ) + + +# ProcessingStep class +class ProcessingStep(ArchiveSection): + m_def = Section(label='Processing Step') + + step_name = Quantity( + type=str, + description='Name of the processing step', + a_eln=ELNAnnotation(label='Step Name', component='StringEditQuantity'), + ) + + method = Quantity( + type=str, + description='Method used in the processing step (e.g., spin-coating, dropcasting)', + a_eln=ELNAnnotation(label='Method', component='StringEditQuantity'), + ) + + atmosphere = SubSection( + section_def=ProcessingAtmosphere, + a_eln=ELNAnnotation(label='Atmosphere'), + ) + + temperature = Quantity( + type=float, + unit='°C', + description='Temperature during the step', + a_eln=ELNAnnotation( + label='Temperature', defaultDisplayUnit='°C', component='NumberEditQuantity' + ), + ) + + duration = Quantity( + type=float, + 
unit='s', + description='Duration of the step', + a_eln=ELNAnnotation( + label='Duration', defaultDisplayUnit='s', component='NumberEditQuantity' + ), + ) + + antisolvent = Quantity( + type=str, + description='Antisolvent used', + a_eln=ELNAnnotation(label='Antisolvent', component='StringEditQuantity'), + ) + + gas = Quantity( + type=str, + description='Gas used in the process', + a_eln=ELNAnnotation(label='Gas', component='StringEditQuantity'), + ) + + solution = SubSection( + section_def=ReactionSolution, a_eln=ELNAnnotation(label='Solution') + ) + + additional_parameters = Quantity( + type=JSON, + description='Any additional parameters specific to this processing step', + a_eln=ELNAnnotation(label='Additional Parameters'), + ) + + +# Deposition class +class Deposition(ArchiveSection): + steps = SubSection( + section_def=ProcessingStep, + repeats=True, + description='List of processing steps in order of execution. Only report conditions that have been explicitly reported.', + ) + + reviewer_additional_notes = Quantity( + type=str, + description='Any additional comments or observations', + a_eln=ELNAnnotation(label='Additional Notes', component='RichTextEditQuantity'), + ) + + additional_notes = Quantity( + type=str, description='Any additional comments or observations' + ) + + +# Layer class +class Layer(ArchiveSection): + name = Quantity( + type=str, + description='Name of the layer', + a_eln=ELNAnnotation(label='Layer Name', component='StringEditQuantity'), + ) + + thickness = Quantity( + type=float, + unit='nm', + description='Thickness of the layer', + a_eln=ELNAnnotation( + label='Thickness', + component='NumberEditQuantity', + defaultDisplayUnit='nm', + props=dict(minValue=0), + ), + ) + + functionality = Quantity( + type=MEnum( + [ + 'Hole-transport', + 'Electron-transport', + 'Contact', + 'Absorber', + 'Other', + 'Substrate', + ] + ), + description='Functionality of the layer', + a_eln=ELNAnnotation(label='Functionality', component='EnumEditQuantity'), + 
) + + deposition = SubSection( + section_def=Deposition, a_eln=ELNAnnotation(label='Deposition') + ) + + +# PerovskiteSolarCell class +class LLMExtractedPerovskiteSolarCell(PublicationReference, Schema): + m_def = Section(label='LLM Extracted Perovskite Solar Cell') + + review_completed = Quantity( + type=bool, + description='True if the review of the data is completed', + default=False, + a_eln=ELNAnnotation(label='Review Completed', component='BoolEditQuantity'), + ) + + DOI_number = Quantity( + type=str, + description='DOI number of the publication', + a_eln=ELNAnnotation(label='DOI Number', component='URLEditQuantity'), + ) + + cell_stack = Quantity( + type=str, + shape=['*'], + description='The stack sequence of the cell.', + a_eln=ELNAnnotation(label='Cell Stack', component='StringEditQuantity'), + ) + + perovskite_composition = Quantity( + type=str, + description='Chemical formula of the perovskite absorber', + a_eln=ELNAnnotation( + label='Perovskite Composition', component='StringEditQuantity' + ), + ) + + device_architecture = Quantity( + type=MEnum(['pin', 'nip', 'back-contacted', 'front-contacted']), + description='Device architecture', + a_eln=ELNAnnotation(label='Device Architecture', component='EnumEditQuantity'), + ) + + pce = Quantity( + type=float, + description='Power Conversion Efficiency (PCE)', + a_eln=ELNAnnotation( + label='PCE', + component='NumberEditQuantity', + props=dict(minValue=0, maxValue=40), + ), + ) + + jsc = Quantity( + type=float, + unit='mA/cm**2', + description='Short-circuit current density (JSC)', + a_eln=ELNAnnotation( + label='JSC', defaultDisplayUnit='mA/cm**2', component='NumberEditQuantity' + ), + ) + + voc = Quantity( + type=float, + unit='V', + description='Open-circuit voltage (VOC)', + a_eln=ELNAnnotation( + label='VOC', component='NumberEditQuantity', props=dict(minValue=0) + ), + ) + + ff = Quantity( + type=float, + description='Fill Factor (FF)', + a_eln=ELNAnnotation( + label='Fill Factor', + 
component='NumberEditQuantity', + props=dict(minValue=0, maxValue=100), + ), + ) + + active_area = Quantity( + type=float, + unit='cm**2', + description='Reported active area of the solar cell.', + a_eln=ELNAnnotation( + label='Active Area', + component='NumberEditQuantity', + defaultDisplayUnit='cm**2', + props=dict(minValue=0), + ), + ) + + number_devices = Quantity( + type=int, + description='Number of devices over which the metrics have been averaged', + a_eln=ELNAnnotation(label='Number of Devices', component='NumberEditQuantity'), + ) + + averaged_quantities = Quantity( + type=bool, + description='True if metrics are averaged over multiple devices', + a_eln=ELNAnnotation(label='Averaged Quantities', component='BoolEditQuantity'), + ) + + light_source = SubSection( + section_def=LightSource, a_eln=ELNAnnotation(label='Light Source') + ) + + bandgap = Quantity( + type=float, + unit='eV', + description='Bandgap of the perovskite material in eV. Include this field only if the bandgap has been directly measured in the experiment.', + a_eln=ELNAnnotation( + label='Bandgap', + component='NumberEditQuantity', + props=dict(minValue=0.5, maxValue=4.0), + ), + ) + + encapsulation = Quantity( + type=str, + description='Encapsulation method, if any', + a_eln=ELNAnnotation(label='Encapsulation', component='StringEditQuantity'), + ) + + reviewer_additional_notes = Quantity( + type=str, + description='Any additional comments or observations', + a_eln=ELNAnnotation(label='Additional Notes', component='RichTextEditQuantity'), + ) + + additional_notes = Quantity( + type=str, description='Any additional comments or observations' + ) + + stability = SubSection( + section_def=Stability, a_eln=ELNAnnotation(label='Stability') + ) + + layers = SubSection( + section_def=Layer, repeats=True, a_eln=ELNAnnotation(label='Layers') + ) + + +m_package.__init_metainfo__() From 2e7e4d1c19412196638299d23cffa75fcbabe83b Mon Sep 17 00:00:00 2001 From: Kevin M Jablonka 
<32935233+kjappelbaum@users.noreply.github.com> Date: Wed, 6 Nov 2024 17:03:01 +0100 Subject: [PATCH 02/27] feat: migrate extraction schema to new version (#27) * feat: update schema to new LLM version * Added review base section which needs to be included in every section. * add additional field descriptions * Fixed typos and some other things --------- Co-authored-by: Pepe Marquez --- .DS_Store | Bin 0 -> 6148 bytes .../llm_extraction_schema.py | 179 ++++++++++++------ 2 files changed, 119 insertions(+), 60 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..29af2be25832289b77a036485b20994863661f54 GIT binary patch literal 6148 zcmeHK%}T>S5Z<-XrW7FuMUM+!3$~?*;w8lT0!H+pQWFw17_+5G&7l->)fe(jd>&_Z zH)1ho5jz9B-~8@oKgj+t#<(|+28=n3F&i2pN2Ni~-5A<1$%q`s2+MpLrZNQk-NgPn z;I~_>U@^;C@%{V5X`19kzyHBETGr-P+iu%k`_6xqrC$WaJokdx4O*8{reUoI;Z+is z6KCg4W<`)>@k|vYQ4A@!*GU%1(v$Nn%2lne19r#mOq{*NVle87;qZ9b6N{5!zb8gV zqvf(=?;aeUUQC|S=S;q8J~_~>WZPg3Z=ifF=f$68smvb0SLfGxgv0 Date: Wed, 6 Nov 2024 22:27:35 +0100 Subject: [PATCH 03/27] Draft of app for the extracted entries --- pyproject.toml | 8 +- .../apps/__init__.py | 11 + .../apps/llm_extracted_solarcells.py | 215 ++++++++++++++++++ 3 files changed, 233 insertions(+), 1 deletion(-) create mode 100644 src/perovskite_solar_cell_database/apps/llm_extracted_solarcells.py diff --git a/pyproject.toml b/pyproject.toml index 0dbd73f..00d3b2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,7 +132,13 @@ where = ["src"] [project.entry-points.'nomad.plugin'] perovskite_solar_cell = "perovskite_solar_cell_database:perovskite_solar_cell" +<<<<<<< HEAD solar_cell_app = "perovskite_solar_cell_database.apps:solar_cells" perovskite_composition = "perovskite_solar_cell_database:perovskite_composition" ion_parser = "perovskite_solar_cell_database:ion_parser" -llm_extraction_schema = "perovskite_solar_cell_database:llm_extraction_schema" \ No newline at end of 
file +llm_extraction_schema = "perovskite_solar_cell_database:llm_extraction_schema" +======= +llm_extraction_schema = "perovskite_solar_cell_database:llm_extraction_schema" +solar_cell_app = "perovskite_solar_cell_database.apps:solar_cells" +llm_extracted_solar_cells = "perovskite_solar_cell_database.apps:llm_extracted_solar_cells" +>>>>>>> Draft of app for the extracted entries diff --git a/src/perovskite_solar_cell_database/apps/__init__.py b/src/perovskite_solar_cell_database/apps/__init__.py index b479cde..cb52bdd 100644 --- a/src/perovskite_solar_cell_database/apps/__init__.py +++ b/src/perovskite_solar_cell_database/apps/__init__.py @@ -1,5 +1,8 @@ from nomad.config.models.plugins import AppEntryPoint +from perovskite_solar_cell_database.apps.llm_extracted_solarcells import ( + llm_extracted_solar_cells, +) from perovskite_solar_cell_database.apps.solar_cell_app import solar_cell_app solar_cells = AppEntryPoint( @@ -12,3 +15,11 @@ """, app=solar_cell_app, ) + +llm_extracted_solar_cells = AppEntryPoint( + name='LLM Extracted Solar Cells', + description=""" + This app allows you to search **LLM extracted solar cell data** within NOMAD. + """, + app=llm_extracted_solar_cells, +) diff --git a/src/perovskite_solar_cell_database/apps/llm_extracted_solarcells.py b/src/perovskite_solar_cell_database/apps/llm_extracted_solarcells.py new file mode 100644 index 0000000..513390a --- /dev/null +++ b/src/perovskite_solar_cell_database/apps/llm_extracted_solarcells.py @@ -0,0 +1,215 @@ +import yaml +from nomad.config.models.ui import ( + App, + Column, + Columns, + FilterMenu, + FilterMenus, + Filters, +) + +llm_extracted_solar_cells = App( + # Label of the App + label='LLM Extracted Solar Cells', + # Path used in the URL, must be unique + path='llm-extracted-solar-cells', + # Used to categorize apps in the explore menu + category='LLM strcutured data extraction', + # Brief description used in the app menu + description=""" + Explore the LLM extracted solar cells. 
+ """, + # Longer description that can also use markdown + readme=""" + Explore LLM extracted solar cells. + """, + # Controls the available search filters. If you want to filter by + # quantities in a schema package, you need to load the schema package + # explicitly here. Note that you can use a glob syntax to load the + # entire package, or just a single schema from a package. + filters=Filters( + include=[ + '*#perovskite_solar_cell_database.llm_extraction_schema.LLMExtractedPerovskiteSolarCell', + ] + ), + # Controls which columns are shown in the results table + columns=Columns( + selected=[ + 'authors', + # 'results.material.elements', + 'entry_type', + 'data.review_completed#perovskite_solar_cell_database.llm_extraction_schema.LLMExtractedPerovskiteSolarCell', + 'references', + # 'data.lab_id#nomad_material_processing.combinatorial.ThinFilmCombinatorialSample' + ], + options={ + 'entry_type': Column(label='Entry type', align='left'), + 'entry_name': Column(label='Name', align='left'), + 'entry_create_time': Column(label='Entry time', align='left'), + 'authors': Column(label='Authors', align='left'), + 'upload_name': Column(label='Upload name', align='left'), + 'references': Column(label='References', align='left'), + 'data.review_completed#perovskite_solar_cell_database.llm_extraction_schema.LLMExtractedPerovskiteSolarCell': Column( + label='Review completed', align='left' + ), # noqa: E501 + 'data.publication_title#perovskite_solar_cell_database.llm_extraction_schema.LLMExtractedPerovskiteSolarCell': Column( + label='Publication title', align='left' + ), # noqa: E501 + # 'data.lab_id#nomad_htem_database.schema_packages.htem_package.HTEMLibrary': Column( # noqa: E501 + # label='Library ID', align='left' + # ), + 'results.material.elements': Column(label='Elements', align='left'), + }, + ), + # Dictionary of search filters that are always enabled for queries made + # within this app. 
This is especially important to narrow down the + # results to the wanted subset. Any available search filter can be + # targeted here. This example makes sure that only entries that use + # MySchema are included. + filters_locked={ + 'entry_type': 'LLMExtractedPerovskiteSolarCell', + }, + # Controls the filter menus shown on the left + filter_menus=FilterMenus( + options={ + 'material': FilterMenu(label='Material', level=0), + 'elements': FilterMenu(label='Elements / Formula', level=1, size='xl'), + 'eln': FilterMenu(label='Electronic Lab Notebook', level=0), + 'custom_quantities': FilterMenu( + label='User Defined Quantities', level=0, size='l' + ), + 'author': FilterMenu(label='Author / Origin / Dataset', level=0, size='m'), + 'metadata': FilterMenu(label='Visibility / IDs / Schema', level=0), + 'optimade': FilterMenu(label='Optimade', level=0, size='m'), + } + ), + # Controls the default dashboard shown in the search interface + dashboard=yaml.safe_load( + """ + widgets: + - type: terms + scale: linear + search_quantity: data.journal#perovskite_solar_cell_database.llm_extraction_schema.LLMExtractedPerovskiteSolarCell + layout: + xxl: + minH: 3 + minW: 3 + h: 9 + w: 6 + y: 0 + x: 6 + xl: + minH: 3 + minW: 3 + h: 5 + w: 6 + y: 0 + x: 6 + lg: + minH: 3 + minW: 3 + h: 9 + w: 6 + y: 0 + x: 6 + md: + minH: 3 + minW: 3 + h: 5 + w: 4 + y: 0 + x: 4 + sm: + minH: 3 + minW: 3 + h: 5 + w: 3 + y: 0 + x: 4 + - type: terms + scale: linear + search_quantity: authors.name + title: Reviewer names + layout: + xxl: + minH: 3 + minW: 3 + h: 9 + w: 6 + y: 0 + x: 0 + xl: + minH: 3 + minW: 3 + h: 5 + w: 6 + y: 0 + x: 0 + lg: + minH: 3 + minW: 3 + h: 9 + w: 6 + y: 0 + x: 0 + md: + minH: 3 + minW: 3 + h: 5 + w: 4 + y: 0 + x: 0 + sm: + minH: 3 + minW: 3 + h: 5 + w: 4 + y: 0 + x: 0 + - type: histogram + autorange: false + nbins: 30 + y: + scale: linear + x: + search_quantity: data.publication_date#perovskite_solar_cell_database.llm_extraction_schema.LLMExtractedPerovskiteSolarCell + 
layout: + xxl: + minH: 3 + minW: 3 + h: 3 + w: 8 + y: 0 + x: 12 + xl: + minH: 3 + minW: 3 + h: 3 + w: 7 + y: 0 + x: 12 + lg: + minH: 3 + minW: 3 + h: 3 + w: 8 + y: 0 + x: 12 + md: + minH: 3 + minW: 3 + h: 3 + w: 7 + y: 0 + x: 8 + sm: + minH: 3 + minW: 3 + h: 3 + w: 5 + y: 0 + x: 7 + + """ + ), +) From 33d025c0940e341ca44931c1080009c1257718af Mon Sep 17 00:00:00 2001 From: Pepe Marquez Date: Wed, 6 Nov 2024 22:36:53 +0100 Subject: [PATCH 04/27] Fixed key in app. --- .../apps/llm_extracted_solarcells.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/perovskite_solar_cell_database/apps/llm_extracted_solarcells.py b/src/perovskite_solar_cell_database/apps/llm_extracted_solarcells.py index 513390a..7d28d84 100644 --- a/src/perovskite_solar_cell_database/apps/llm_extracted_solarcells.py +++ b/src/perovskite_solar_cell_database/apps/llm_extracted_solarcells.py @@ -89,7 +89,7 @@ widgets: - type: terms scale: linear - search_quantity: data.journal#perovskite_solar_cell_database.llm_extraction_schema.LLMExtractedPerovskiteSolarCell + quantity: data.journal#perovskite_solar_cell_database.llm_extraction_schema.LLMExtractedPerovskiteSolarCell layout: xxl: minH: 3 @@ -128,7 +128,7 @@ x: 4 - type: terms scale: linear - search_quantity: authors.name + quantity: authors.name title: Reviewer names layout: xxl: @@ -172,7 +172,7 @@ y: scale: linear x: - search_quantity: data.publication_date#perovskite_solar_cell_database.llm_extraction_schema.LLMExtractedPerovskiteSolarCell + quantity: data.publication_date#perovskite_solar_cell_database.llm_extraction_schema.LLMExtractedPerovskiteSolarCell layout: xxl: minH: 3 From 7f3ff50b6873a6d94c1e517ae742d0a48e4f6d7a Mon Sep 17 00:00:00 2001 From: Pepe Marquez Date: Wed, 6 Nov 2024 22:56:18 +0100 Subject: [PATCH 05/27] Polishing some annotations in the ELN --- src/perovskite_solar_cell_database/llm_extraction_schema.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index b70fa6a..1d7dabc 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -54,7 +54,7 @@ class PerovskiteComposition(SectionRevision): dimensionality = Quantity( type=MEnum(['0D', '1D', '2D', '3D', '2D/3D']), description='Dimensionality of the perovskite structure', - a_eln=ELNAnnotation(label='Dimensionality', component='EnumEditQuantity'), + a_eln=ELNAnnotation(label='Dimensionality', component='RadioEditQuantity'), ) ions_a_site = SubSection( @@ -274,7 +274,7 @@ class ReactionSolution(SectionRevision): # ProcessingStep class class ProcessingStep(SectionRevision): - m_def = Section(label='Processing Step') + m_def = Section(label='Processing Step', label_quantity='method') step_name = Quantity( type=str, From 0d4d2d7c428181352f37b7d3efd888e51b860e56 Mon Sep 17 00:00:00 2001 From: Pepe Marquez Date: Thu, 7 Nov 2024 10:02:23 +0100 Subject: [PATCH 06/27] Added proper radio ELN component --- src/perovskite_solar_cell_database/llm_extraction_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 1d7dabc..842f024 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -54,7 +54,7 @@ class PerovskiteComposition(SectionRevision): dimensionality = Quantity( type=MEnum(['0D', '1D', '2D', '3D', '2D/3D']), description='Dimensionality of the perovskite structure', - a_eln=ELNAnnotation(label='Dimensionality', component='RadioEditQuantity'), + a_eln=ELNAnnotation(label='Dimensionality', component='RadioEnumEditQuantity'), ) ions_a_site = SubSection( From 73c61121a302c796cd8de3fb230e6d7c9461477e Mon Sep 17 00:00:00 2001 From: Pepe 
Marquez Date: Thu, 7 Nov 2024 10:03:32 +0100 Subject: [PATCH 07/27] fix typo in key --- src/perovskite_solar_cell_database/llm_extraction_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 842f024..a6db5b9 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -61,7 +61,7 @@ class PerovskiteComposition(SectionRevision): section_def=Ion, repeats=True, a_eln=ELNAnnotation(label='A-site Ions') ) - b_ions_b_site = SubSection( + ions_b_site = SubSection( section_def=Ion, repeats=True, a_eln=ELNAnnotation(label='B-site Ions') ) From 5c306f53efd81bc2865b3b9e5e32bed9b4e584fd Mon Sep 17 00:00:00 2001 From: Pepe Marquez Date: Thu, 7 Nov 2024 13:59:06 +0100 Subject: [PATCH 08/27] Changed the enum of the device architecture. --- src/perovskite_solar_cell_database/llm_extraction_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index a6db5b9..d7cc19e 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -428,7 +428,7 @@ class LLMExtractedPerovskiteSolarCell(PublicationReference, SectionRevision, Sch ) device_architecture = Quantity( - type=MEnum(['pin', 'nip', 'back-contacted', 'front-contacted']), + type=MEnum(['pin', 'nip', 'Back contacted', 'Front contacted']), description='Device architecture', a_eln=ELNAnnotation(label='Device Architecture', component='EnumEditQuantity'), ) From ef6ed19dafad1a92096fee387ab08b54503fcc17 Mon Sep 17 00:00:00 2001 From: Pepe Marquez Date: Thu, 7 Nov 2024 21:35:32 +0100 Subject: [PATCH 09/27] Unsaved fixes in pyproject --- pyproject.toml | 6 ------ 1 file changed, 6 
deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 00d3b2b..66fbda6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -132,13 +132,7 @@ where = ["src"] [project.entry-points.'nomad.plugin'] perovskite_solar_cell = "perovskite_solar_cell_database:perovskite_solar_cell" -<<<<<<< HEAD solar_cell_app = "perovskite_solar_cell_database.apps:solar_cells" perovskite_composition = "perovskite_solar_cell_database:perovskite_composition" ion_parser = "perovskite_solar_cell_database:ion_parser" llm_extraction_schema = "perovskite_solar_cell_database:llm_extraction_schema" -======= -llm_extraction_schema = "perovskite_solar_cell_database:llm_extraction_schema" -solar_cell_app = "perovskite_solar_cell_database.apps:solar_cells" -llm_extracted_solar_cells = "perovskite_solar_cell_database.apps:llm_extracted_solar_cells" ->>>>>>> Draft of app for the extracted entries From d538def7b42fd03564750f41c66115e11a4fa2cf Mon Sep 17 00:00:00 2001 From: Pepe Marquez Date: Fri, 8 Nov 2024 09:46:20 +0100 Subject: [PATCH 10/27] Changed atmosphere from section to enum --- .../llm_extraction_schema.py | 39 +++---------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index d7cc19e..5c32235 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -209,36 +209,6 @@ class Stability(SectionRevision): ) - -# ProcessingAtmosphere class -class ProcessingAtmosphere(SectionRevision): - m_def = Section(label='Processing Atmosphere') - - type = Quantity( - type=str, - description='Type of atmosphere', - a_eln=ELNAnnotation(label='Atmosphere Type', component='StringEditQuantity'), - ) - - pressure = Quantity( - type=float, - unit='mbar', - description='Pressure during processing', - a_eln=ELNAnnotation( - label='Pressure', defaultDisplayUnit='mbar',
component='NumberEditQuantity' - ), - ) - - relative_humidity = Quantity( - type=float, - description='Relative humidity during processing', - a_eln=ELNAnnotation( - label='Relative Humidity', - component='NumberEditQuantity', - props=dict(minValue=0, maxValue=100), - ), - ) - - # ReactionSolution class class ReactionSolution(SectionRevision): m_def = Section(label='Reaction Solution') @@ -288,9 +258,12 @@ class ProcessingStep(SectionRevision): a_eln=ELNAnnotation(label='Method', component='StringEditQuantity'), ) - atmosphere = SubSection( - section_def=ProcessingAtmosphere, - a_eln=ELNAnnotation(label='Atmosphere'), + atmosphere = Quantity( + type=MEnum( + ['Ambient air', 'Dry air', 'Air', 'N2', 'Ar', 'He', 'H2', 'Vacuum', 'Other'] + ), + description='Atmosphere during the step', + a_eln=ELNAnnotation(label='Atmosphere', component='EnumEditQuantity'), ) temperature = Quantity( From 52608136d1c881dc3d6abf3aeae3b81c74b8e9e0 Mon Sep 17 00:00:00 2001 From: Pepe Marquez Date: Fri, 8 Nov 2024 09:53:40 +0100 Subject: [PATCH 11/27] Replaced Perovskite composition --- .../llm_extraction_schema.py | 28 ++----------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 5c32235..b328bf5 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -7,6 +7,7 @@ from nomad.datamodel.metainfo.eln import ELNAnnotation from nomad.metainfo import JSON, Quantity, Section, SubSection from nomad.metainfo.metainfo import MEnum +from perovskite_solar_cell_database.composition import PerovskiteCompositionSection if TYPE_CHECKING: pass @@ -42,32 +43,9 @@ class Ion(SectionRevision): ) -class PerovskiteComposition(SectionRevision): +class PerovskiteComposition(SectionRevision, PerovskiteCompositionSection): m_def = Section(label='Perovskite Composition') - - formula = 
Quantity( - type=str, - description='The perovskite composition according to IUPAC recommendations, where standard abbreviations are used for all ions', - a_eln=ELNAnnotation(label='Formula', component='StringEditQuantity'), - ) - - dimensionality = Quantity( - type=MEnum(['0D', '1D', '2D', '3D', '2D/3D']), - description='Dimensionality of the perovskite structure', - a_eln=ELNAnnotation(label='Dimensionality', component='RadioEnumEditQuantity'), - ) - - ions_a_site = SubSection( - section_def=Ion, repeats=True, a_eln=ELNAnnotation(label='A-site Ions') - ) - - ions_b_site = SubSection( - section_def=Ion, repeats=True, a_eln=ELNAnnotation(label='B-site Ions') - ) - - ions_x_site = SubSection( - section_def=Ion, repeats=True, a_eln=ELNAnnotation(label='X-site Ions') - ) + pass # LightSource class From b085255bb3020d576e1efd6b019a603c8419e055 Mon Sep 17 00:00:00 2001 From: Pepe Marquez Date: Fri, 8 Nov 2024 10:00:42 +0100 Subject: [PATCH 12/27] Organized imports --- src/perovskite_solar_cell_database/llm_extraction_schema.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index b328bf5..60526c5 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -7,6 +7,7 @@ from nomad.datamodel.metainfo.eln import ELNAnnotation from nomad.metainfo import JSON, Quantity, Section, SubSection from nomad.metainfo.metainfo import MEnum + from perovskite_solar_cell_database.composition import PerovskiteCompositionSection if TYPE_CHECKING: From da73e46bbe118c9b2b2a47751d9fe4951800945e Mon Sep 17 00:00:00 2001 From: Pepe Marquez Date: Fri, 8 Nov 2024 11:00:28 +0100 Subject: [PATCH 13/27] Added app for the extracted solar cells review --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 66fbda6..abec33d 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -136,3 +136,4 @@ solar_cell_app = "perovskite_solar_cell_database.apps:solar_cells" perovskite_composition = "perovskite_solar_cell_database:perovskite_composition" ion_parser = "perovskite_solar_cell_database:ion_parser" llm_extraction_schema = "perovskite_solar_cell_database:llm_extraction_schema" +llm_extracted_solar_cells = "perovskite_solar_cell_database.apps:llm_extracted_solar_cells" From 668e98fa428366c77f82588d78a13a212dcd4442 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hampus=20N=C3=A4sstr=C3=B6m?= Date: Fri, 8 Nov 2024 12:24:45 +0100 Subject: [PATCH 14/27] Changed coefficient to str --- src/perovskite_solar_cell_database/composition.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/perovskite_solar_cell_database/composition.py b/src/perovskite_solar_cell_database/composition.py index 2be94c0..75c2b33 100644 --- a/src/perovskite_solar_cell_database/composition.py +++ b/src/perovskite_solar_cell_database/composition.py @@ -463,7 +463,7 @@ class PerovskiteIonComponent(SystemComponent, PerovskiteIonSection): ) ) coefficient = Quantity( - type=float, + type=str, description='The stoichiometric coefficient', a_eln=ELNAnnotation(component=ELNComponentEnum.NumberEditQuantity), shape=[], @@ -831,12 +831,10 @@ def normalize(self, archive: 'EntryArchive', logger: 'BoundLogger') -> None: self.short_form += ion.abbreviation if ion.coefficient is None: continue - if ion.coefficient == 1: + if ion.coefficient == '1': coefficient_str = '' - elif ion.coefficient == int(ion.coefficient): - coefficient_str = str(int(ion.coefficient)) else: - coefficient_str = f'{ion.coefficient:.2}' + coefficient_str = ion.coefficient self.long_form += f'{ion.abbreviation}{coefficient_str}' if not isinstance(ion.molecular_formula, str): continue From 6e7fb6bca8080d4978e21342ff7958ad58a0f1ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hampus=20N=C3=A4sstr=C3=B6m?= Date: Fri, 8 Nov 2024 12:27:56 +0100 Subject: [PATCH 15/27] 
Changed descriptive formula to long form --- src/perovskite_solar_cell_database/composition.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/perovskite_solar_cell_database/composition.py b/src/perovskite_solar_cell_database/composition.py index 75c2b33..31a4c66 100644 --- a/src/perovskite_solar_cell_database/composition.py +++ b/src/perovskite_solar_cell_database/composition.py @@ -860,6 +860,7 @@ def normalize(self, archive: 'EntryArchive', logger: 'BoundLogger') -> None: label='Perovskite Composition', description='A system describing the chemistry and components of the perovskite.', system_relation=Relation(type='root'), + chemical_formula_descriptive=self.long_form, ) parent_system.structural_type = archive.results.material.structural_type From feef2ea7a71d0840ef89b79a8e7ddc2d2257c4b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hampus=20N=C3=A4sstr=C3=B6m?= Date: Fri, 8 Nov 2024 14:05:47 +0100 Subject: [PATCH 16/27] Changed edit quantity for coefficient --- src/perovskite_solar_cell_database/composition.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/perovskite_solar_cell_database/composition.py b/src/perovskite_solar_cell_database/composition.py index 31a4c66..ed4f002 100644 --- a/src/perovskite_solar_cell_database/composition.py +++ b/src/perovskite_solar_cell_database/composition.py @@ -465,7 +465,7 @@ class PerovskiteIonComponent(SystemComponent, PerovskiteIonSection): coefficient = Quantity( type=str, description='The stoichiometric coefficient', - a_eln=ELNAnnotation(component=ELNComponentEnum.NumberEditQuantity), + a_eln=ELNAnnotation(component=ELNComponentEnum.StringEditQuantity), shape=[], ) system = Quantity( From 8e986bd74f8c8939bdbe0cb2d87157e7639f2939 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Sun, 10 Nov 2024 11:33:06 +0100 Subject: [PATCH 17/27] feat: update schema to match the pydantic model and add field for updating ordering --- .../llm_extraction_schema.py | 33 +++++++++++++++---- 1 file changed, 26 
insertions(+), 7 deletions(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 60526c5..9705709 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -367,13 +367,6 @@ class LLMExtractedPerovskiteSolarCell(PublicationReference, SectionRevision, Sch a_eln=ELNAnnotation(label='DOI Number', component='URLEditQuantity'), ) - cell_stack = Quantity( - type=str, - shape=['*'], - description='The stack sequence of the cell.', - a_eln=ELNAnnotation(label='Cell Stack', component='StringEditQuantity'), - ) - perovskite_composition = SubSection( section_def=PerovskiteComposition, a_eln=ELNAnnotation(label='Perovskite Composition'), @@ -486,5 +479,31 @@ class LLMExtractedPerovskiteSolarCell(PublicationReference, SectionRevision, Sch section_def=Layer, repeats=True, a_eln=ELNAnnotation(label='Layers') ) + layer_order = Quantity( + type=str, + description='Order of the layers in the device stack. Use the layer names as they appear in the "Layers" section, separated by commas.', + a_eln=ELNAnnotation(label='Layer Order', component='StringEditQuantity'), + ) + + # normalizer that reorderes the layers according to the layer_order + def normalize(self): + if self.layer_order: + layer_order = self.layer_order.split(',') + layers = self.layers + new_layers = [] + for layer_name in layer_order: + layer_name_stripped = layer_name.strip() + for layer in layers: + if layer.name == layer_name_stripped: + self.layers.append(layer) + break + + # if the new list is not the same length as the old one + # then the are some issues with the keys and we should raise an error + if len(new_layers) != len(layers): + raise ValueError( + 'The layer order is not valid. Please check the layer names and try again.' 
+ ) + m_package.__init_metainfo__() From 3a656140bd22d7f4f8bad0af5796bad237f3eb30 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Sun, 10 Nov 2024 11:36:20 +0100 Subject: [PATCH 18/27] chore: update signature of normalizer function --- src/perovskite_solar_cell_database/llm_extraction_schema.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 9705709..6b9a18d 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -486,7 +486,7 @@ class LLMExtractedPerovskiteSolarCell(PublicationReference, SectionRevision, Sch ) # normalizer that reorderes the layers according to the layer_order - def normalize(self): + def normalize(self, archive, logger): if self.layer_order: layer_order = self.layer_order.split(',') layers = self.layers From 54ad88d15eb4fed9d30f9ad6dda4566d8cbedbb3 Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Sun, 10 Nov 2024 11:39:32 +0100 Subject: [PATCH 19/27] fix: bug in normalizer --- src/perovskite_solar_cell_database/llm_extraction_schema.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 6b9a18d..8e1d8ca 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -495,7 +495,7 @@ def normalize(self, archive, logger): layer_name_stripped = layer_name.strip() for layer in layers: if layer.name == layer_name_stripped: - self.layers.append(layer) + new_layers.append(layer) break # if the new list is not the same length as the old one @@ -504,6 +504,7 @@ def normalize(self, archive, logger): raise ValueError( 'The layer order is not valid. Please check the layer names and try again.' 
) - + else: + self.layers = new_layers m_package.__init_metainfo__() From fa208a795d47d6be76a57a054f724708b7c9965b Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Sun, 10 Nov 2024 11:41:52 +0100 Subject: [PATCH 20/27] chore: use better code from sourcery --- .../llm_extraction_schema.py | 31 +++++++------------ 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 8e1d8ca..f05d4f2 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -487,24 +487,17 @@ class LLMExtractedPerovskiteSolarCell(PublicationReference, SectionRevision, Sch # normalizer that reorderes the layers according to the layer_order def normalize(self, archive, logger): - if self.layer_order: - layer_order = self.layer_order.split(',') - layers = self.layers - new_layers = [] - for layer_name in layer_order: - layer_name_stripped = layer_name.strip() - for layer in layers: - if layer.name == layer_name_stripped: - new_layers.append(layer) - break - - # if the new list is not the same length as the old one - # then the are some issues with the keys and we should raise an error - if len(new_layers) != len(layers): - raise ValueError( - 'The layer order is not valid. Please check the layer names and try again.' 
- ) - else: - self.layers = new_layers + if not self.layer_order: + return + + layer_dict = {layer.name: layer for layer in self.layers} + ordered_names = [name.strip() for name in self.layer_order.split(',')] + + if set(ordered_names) != set(layer_dict.keys()): + raise ValueError('Layer order does not match available layers') + + # Reorder in single pass + self.layers = [layer_dict[name] for name in ordered_names] + m_package.__init_metainfo__() From b893b2fc6d4ff3b5ac95caa49b0122e42220f430 Mon Sep 17 00:00:00 2001 From: Kevin M Jablonka <32935233+kjappelbaum@users.noreply.github.com> Date: Mon, 11 Nov 2024 08:25:06 +0100 Subject: [PATCH 21/27] Update src/perovskite_solar_cell_database/llm_extraction_schema.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Hampus Näsström --- src/perovskite_solar_cell_database/llm_extraction_schema.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index f05d4f2..4835cb0 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -494,7 +494,8 @@ def normalize(self, archive, logger): ordered_names = [name.strip() for name in self.layer_order.split(',')] if set(ordered_names) != set(layer_dict.keys()): - raise ValueError('Layer order does not match available layers') + logger.warn('The names in layer_order does not match available layers') + return # Reorder in single pass self.layers = [layer_dict[name] for name in ordered_names] From a009295d4ff522ec9a1b08ddfa78993183141252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hampus=20N=C3=A4sstr=C3=B6m?= Date: Wed, 13 Nov 2024 15:24:49 +0100 Subject: [PATCH 22/27] Added missing super normalize call and corrected default display unit --- src/perovskite_solar_cell_database/composition.py | 2 +- 
src/perovskite_solar_cell_database/llm_extraction_schema.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/perovskite_solar_cell_database/composition.py b/src/perovskite_solar_cell_database/composition.py index ed4f002..0b8dc94 100644 --- a/src/perovskite_solar_cell_database/composition.py +++ b/src/perovskite_solar_cell_database/composition.py @@ -692,7 +692,7 @@ class Impurity(PureSubstanceComponent, PerovskiteChemicalSection): type=float, description='The concentration of the additive or impurity.', a_eln=ELNAnnotation( - component=ELNComponentEnum.NumberEditQuantity, defaultDisplayUnit='mol%' + component=ELNComponentEnum.NumberEditQuantity, defaultDisplayUnit='cm^-3' ), unit='cm^-3', shape=[], diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 4835cb0..16a65cc 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -499,6 +499,7 @@ def normalize(self, archive, logger): # Reorder in single pass self.layers = [layer_dict[name] for name in ordered_names] + super().normalize(archive, logger) m_package.__init_metainfo__() From 7f021424911cf8068f568d9d0ab28d64b4e81c7d Mon Sep 17 00:00:00 2001 From: Sherjeel Shabih Date: Fri, 15 Nov 2024 14:34:32 +0100 Subject: [PATCH 23/27] Implements all feedback from Jesper --- .../llm_extraction_schema.py | 44 ++++++++++++++++--- 1 file changed, 37 insertions(+), 7 deletions(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 16a65cc..f3e65c1 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -63,6 +63,7 @@ class LightSource(SectionRevision): 'White LED', 'Other', 'Outdoor', + '' ] ), description='Type of light source', @@ -110,7 +111,7 @@ class 
Solute(SectionRevision): ) concentration_unit = Quantity( - type=MEnum(['mol/L', 'mmol/L', 'g/L', 'mg/L', 'wt%', 'vol%', 'M']), + type=MEnum(['mol/L', 'mmol/L', 'g/L', 'mg/L', 'wt%', 'vol%', 'M', '']), description='Unit of concentration', a_eln=ELNAnnotation(label='Concentration Unit', component='EnumEditQuantity'), ) @@ -187,6 +188,30 @@ class Stability(SectionRevision): a_eln=ELNAnnotation(label='PCE at End', component='NumberEditQuantity'), ) + potential_bias = Quantity( + type=MEnum( + ['Open circuit', 'MPPT', 'Constant potential', 'Constant current', 'Constant resistance', ''] + ), + description='Potential bias during stability test', + a_eln=ELNAnnotation(label='Potential Bias', component='EnumEditQuantity'), + ) + + + +class Solvent(SectionRevision): + m_def = Section(label='Solvent') + + name = Quantity( + type=str, + description='Name of the solvent', + a_eln=ELNAnnotation(label='Name', component='StringEditQuantity'), + ) + + ratio = Quantity( + type=float, + description='Ratio of this solvent with respect to others - (0-1)', + a_eln=ELNAnnotation(label='Concentration', component='NumberEditQuantity'), + ) # ReactionSolution class class ReactionSolution(SectionRevision): @@ -214,10 +239,8 @@ class ReactionSolution(SectionRevision): ), ) - solvent = Quantity( - type=str, - description='Solvent used', - a_eln=ELNAnnotation(label='Solvent', component='StringEditQuantity'), + solvents = SubSection( + section_def=Solvent, repeats=True, a_eln=ELNAnnotation(label='Solvents') ) @@ -239,7 +262,7 @@ class ProcessingStep(SectionRevision): atmosphere = Quantity( type=MEnum( - ['Ambient air', 'Dry air', 'Air', 'N2', 'Ar', 'He', 'H2', 'Vacuum', 'Other'] + ['Ambient air', 'Dry air', 'Air', 'N2', 'Ar', 'He', 'H2', 'Vacuum', 'Other', ''] ), description='Atmosphere during the step', a_eln=ELNAnnotation(label='Atmosphere', component='EnumEditQuantity'), @@ -269,6 +292,12 @@ class ProcessingStep(SectionRevision): a_eln=ELNAnnotation(label='Gas Quenching', 
component='BoolEditQuantity'), ) + antisolvent_quenching = Quantity( + type=bool, + description='Whether antisolvent quenching was used', + a_eln=ELNAnnotation(label='Antisolvent Quenching', component='BoolEditQuantity'), + ) + solution = SubSection( section_def=ReactionSolution, a_eln=ELNAnnotation(label='Solution') ) @@ -328,6 +357,7 @@ class Layer(SectionRevision): 'Absorber', 'Other', 'Substrate', + '' ] ), description='Functionality of the layer', @@ -373,7 +403,7 @@ class LLMExtractedPerovskiteSolarCell(PublicationReference, SectionRevision, Sch ) device_architecture = Quantity( - type=MEnum(['pin', 'nip', 'Back contacted', 'Front contacted']), + type=MEnum(['pin', 'nip', 'Back contacted', 'Front contacted', 'Other', '']), description='Device architecture', a_eln=ELNAnnotation(label='Device Architecture', component='EnumEditQuantity'), ) From b894fd8dbc5d9b1da767849a3387de1b241685b0 Mon Sep 17 00:00:00 2001 From: Sherjeel Shabih Date: Fri, 15 Nov 2024 14:39:31 +0100 Subject: [PATCH 24/27] ruff --- .../llm_extraction_schema.py | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index f3e65c1..2384d9f 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -63,7 +63,7 @@ class LightSource(SectionRevision): 'White LED', 'Other', 'Outdoor', - '' + '', ] ), description='Type of light source', @@ -190,14 +190,20 @@ class Stability(SectionRevision): potential_bias = Quantity( type=MEnum( - ['Open circuit', 'MPPT', 'Constant potential', 'Constant current', 'Constant resistance', ''] + [ + 'Open circuit', + 'MPPT', + 'Constant potential', + 'Constant current', + 'Constant resistance', + '', + ] ), description='Potential bias during stability test', a_eln=ELNAnnotation(label='Potential Bias', component='EnumEditQuantity'), ) - 
class Solvent(SectionRevision): m_def = Section(label='Solvent') @@ -213,6 +219,7 @@ class Solvent(SectionRevision): a_eln=ELNAnnotation(label='Concentration', component='NumberEditQuantity'), ) + # ReactionSolution class class ReactionSolution(SectionRevision): m_def = Section(label='Reaction Solution') @@ -262,7 +269,18 @@ class ProcessingStep(SectionRevision): atmosphere = Quantity( type=MEnum( - ['Ambient air', 'Dry air', 'Air', 'N2', 'Ar', 'He', 'H2', 'Vacuum', 'Other', ''] + [ + 'Ambient air', + 'Dry air', + 'Air', + 'N2', + 'Ar', + 'He', + 'H2', + 'Vacuum', + 'Other', + '', + ] ), description='Atmosphere during the step', a_eln=ELNAnnotation(label='Atmosphere', component='EnumEditQuantity'), @@ -295,7 +313,9 @@ class ProcessingStep(SectionRevision): antisolvent_quenching = Quantity( type=bool, description='Whether antisolvent quenching was used', - a_eln=ELNAnnotation(label='Antisolvent Quenching', component='BoolEditQuantity'), + a_eln=ELNAnnotation( + label='Antisolvent Quenching', component='BoolEditQuantity' + ), ) solution = SubSection( @@ -357,7 +377,7 @@ class Layer(SectionRevision): 'Absorber', 'Other', 'Substrate', - '' + '', ] ), description='Functionality of the layer', From a1b512f5a80983b813d4f04d457c3c0611f2e61f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hampus=20N=C3=A4sstr=C3=B6m?= Date: Fri, 15 Nov 2024 15:21:20 +0100 Subject: [PATCH 25/27] Changed empty strings to Unknown and changed solvent ratio to volume_fraction --- .../llm_extraction_schema.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 2384d9f..0ce14b1 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -63,7 +63,7 @@ class LightSource(SectionRevision): 'White LED', 'Other', 'Outdoor', - '', + 'Unknown', ] ), description='Type of 
light source', @@ -111,7 +111,7 @@ class Solute(SectionRevision): ) concentration_unit = Quantity( - type=MEnum(['mol/L', 'mmol/L', 'g/L', 'mg/L', 'wt%', 'vol%', 'M', '']), + type=MEnum(['mol/L', 'mmol/L', 'g/L', 'mg/L', 'wt%', 'vol%', 'M', 'Unknown']), description='Unit of concentration', a_eln=ELNAnnotation(label='Concentration Unit', component='EnumEditQuantity'), ) @@ -196,7 +196,7 @@ class Stability(SectionRevision): 'Constant potential', 'Constant current', 'Constant resistance', - '', + 'Unknown', ] ), description='Potential bias during stability test', @@ -213,9 +213,9 @@ class Solvent(SectionRevision): a_eln=ELNAnnotation(label='Name', component='StringEditQuantity'), ) - ratio = Quantity( + volume_fraction = Quantity( type=float, - description='Ratio of this solvent with respect to others - (0-1)', + description='The volume fraction of the solvent with respect to the other solvents in the solution', a_eln=ELNAnnotation(label='Concentration', component='NumberEditQuantity'), ) @@ -279,7 +279,7 @@ class ProcessingStep(SectionRevision): 'H2', 'Vacuum', 'Other', - '', + 'Unknown', ] ), description='Atmosphere during the step', @@ -377,7 +377,7 @@ class Layer(SectionRevision): 'Absorber', 'Other', 'Substrate', - '', + 'Unknown', ] ), description='Functionality of the layer', @@ -423,7 +423,7 @@ class LLMExtractedPerovskiteSolarCell(PublicationReference, SectionRevision, Sch ) device_architecture = Quantity( - type=MEnum(['pin', 'nip', 'Back contacted', 'Front contacted', 'Other', '']), + type=MEnum(['pin', 'nip', 'Back contacted', 'Front contacted', 'Other', 'Unknown']), description='Device architecture', a_eln=ELNAnnotation(label='Device Architecture', component='EnumEditQuantity'), ) From dc2f410b99ee5b08c151a7c0e18258dce60fa98a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hampus=20N=C3=A4sstr=C3=B6m?= Date: Fri, 15 Nov 2024 15:23:37 +0100 Subject: [PATCH 26/27] Ruff --- src/perovskite_solar_cell_database/llm_extraction_schema.py | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 0ce14b1..143bf19 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -423,7 +423,9 @@ class LLMExtractedPerovskiteSolarCell(PublicationReference, SectionRevision, Sch ) device_architecture = Quantity( - type=MEnum(['pin', 'nip', 'Back contacted', 'Front contacted', 'Other', 'Unknown']), + type=MEnum( + ['pin', 'nip', 'Back contacted', 'Front contacted', 'Other', 'Unknown'] + ), description='Device architecture', a_eln=ELNAnnotation(label='Device Architecture', component='EnumEditQuantity'), ) From 8082c52ebbb1be8b3f0c3082ef98a17e91355860 Mon Sep 17 00:00:00 2001 From: Sherjeel Shabih Date: Fri, 15 Nov 2024 15:28:48 +0100 Subject: [PATCH 27/27] Adds recursive func to delete all 'Unknown' enums --- .../llm_extraction_schema.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/perovskite_solar_cell_database/llm_extraction_schema.py b/src/perovskite_solar_cell_database/llm_extraction_schema.py index 143bf19..9f7294b 100644 --- a/src/perovskite_solar_cell_database/llm_extraction_schema.py +++ b/src/perovskite_solar_cell_database/llm_extraction_schema.py @@ -551,6 +551,25 @@ def normalize(self, archive, logger): # Reorder in single pass self.layers = [layer_dict[name] for name in ordered_names] + + # Recursive function to traverse sections and set MEnum values to None when they are 'Unknown' + def reset_menum_values(section): + for attribute_name in dir(section): + attribute = getattr(section, attribute_name, None) + if isinstance(attribute, MEnum) and attribute == 'Unknown': + setattr(section, attribute_name, None) + elif isinstance(attribute, list): + # Process lists of subsections + for item in attribute: + if isinstance(item, ArchiveSection): + reset_menum_values(item) + elif
isinstance(attribute, ArchiveSection): + # Recursively process subsections + reset_menum_values(attribute) + + # Start normalization from the current section + reset_menum_values(self) + super().normalize(archive, logger)