Skip to content

Latest commit

 

History

History
318 lines (237 loc) · 8.91 KB

README.rst

File metadata and controls

318 lines (237 loc) · 8.91 KB

https://mindey.com/docs/metaform-logo.png

One star knowing it all...

# HAVING DATA:
DATA = {
  'a': 1.5,
  'b': 1458266965.250572,
  'c': [{'x': {'y': 'LT121000011101001000'}}, {'z': 'Omega'}]}

# GETTING KNOWLEDGE:
SCHEMA = {
  'a': 'price#EUR|to.decimal',
  'b': 'timestamp#date|to.unixtime',
  'c': [{'*': 'contributions',
    'x': {'*': 'origins', 'y': 'account#IBAN|to.string'},
    'z': 'company#name|to.string'}],
}

# NORMALIZATION
metaform.load(DATA).format(SCHEMA)

# OR USE '*' TO REFERENCE SCHEMA TO PACK SCHEMA WITH DATA PACKETS:
metaform.load(dict(DATA, **{'*': 'https://github.com/wefindx/schema/wiki/Sale#test'})).format()

metaform

Join the chat at https://gitter.im/wefindx/metaform

Metaform is a package for hierarchical and nested data normalization.

https://wiki.mindey.com/shared/shots/53dcf81b7efd0573f07c5f562.png

Basic Usage

pip install metaform

import metaform

If your data had an asterisk~!

# INPUT
metaform.load({
  '*': 'https://github.com/mindey/terms/wiki/person#foaf',
  'url': 'http://dbpedia.org/resource/John_Lennon',
  'fullname': 'John Lennon',
  'birthdate': '1940-10-09',
  'spouse': 'http://dbpedia.org/resource/Cynthia_Lennon'
}).format(refresh=True)
# (schemas are cached locally, pass refresh=True to redownload)

# OUTPUT
{
  '*': 'GH:mindey/terms/person#foaf',
  'jsonld:id': 'http://dbpedia.org/resource/John_Lennon',
  'foaf:name': 'John Lennon',
  'schema:birthDate': datetime.datetime(1940, 10, 9, 0, 0),
  'schema:spouse': 'http://dbpedia.org/resource/Cynthia_Lennon'
}

For example, if you read an API data source:

data = metaform.load('https://www.metaculus.com/api2/questions/')
data['*'] ='https://github.com/wefindx/schema/wiki/Topic#metaculuscom-question'
data.format()
# Try it!

Or, if your filenames had references to schema~

df = metaform.read_csv(
  'https://gist.githubusercontent.com/mindey/3f2596e108a5c151f32e1967275a7689/raw/7c4c963219255008fdb438e8b9777cd658eea02e/hello-world.csv',
  schema={
    0: 'Timestamp|to.unixtime',
    1: 'KeyUpOrDown|lambda x: x=="k↓" and "KeyDown" or (x=="k↑" and "KeyUp")',
    2: 'KeyName'},
  header=None
)

Alternatively, save schema to wiki like here, and include the schema token inside filename by encoding it as sub-extension, that is, rename hello-world.csv to [email protected]:

# To get schema token for filename (GH~mindey+schema+KeyEvent@mykeylogger-01) do:
metaform.metawiki.url2ext('https://github.com/mindey/schema/wiki/KeyEvent#mykeylogger-01')

# Then rename filename in the source, and just read file remotely or locally from disk:
df = metaform.read_csv('https://gist.githubusercontent.com/mindey/f33978b31468097b5003f032d5d85eb8/raw/9541191e4d99c052a7668223697ef0ef9ce37977/[email protected]', header=None)

So, what's happening here?

metaform.load( DATA ).format( SCHEMA )

Let’s say we have some data:

data = {
    'hello': 1.0,
    'world': 2,
    'how': ['is', {'are': {'you': 'doing'}}]
}

We can get the template for defining schema, by metaform.template:

metaform.template(data)
{'*': '',
 'hello': {'*': ''},
 'how': [{'*': '', 'are': {'you': {'*': ''}}}],
 'world': {'*': ''}}

This provides an opportunity to specify metadata for each key and the object itself. For example:

schema = {                        # A #    schema = {
    '*': 'greeting',              # L #        '*': 'greeting',
    'hello': {'*': 'length'},     # T #        'hello': 'length',
    'world': {'*': 'atoms'},      # E #        'world': 'atoms',
    'how': [                      # R #        'how': [
         {'*': 'method',          # N #             {'*': 'method',
          'are': {                # A #              'are': {
              '*': 'yup',         # T #                  '*': 'yup',
              'you': {'*': 'me'}} # I #                  'you': {'*': 'me'}}
         }                        # V #             }
    ]}                            # E #        ]}

metaform.normalize(data, schema)
{'atoms': 2, 'length': 1.0, 'method': ['is', {'yup': {'me': 'doing'}}]}

We recommend saving schemas you create for normalizations for data analytics and driver projects in dot-folders .schema, in a JSON or YAML files in that folder.

So, we have access to all keys, and can specify, what to do with them:

schema = {
    '*': 'greeting',
    'hello': 'length|lambda x: x+5.',
    'world': 'atoms|lambda x: str(x)+"ABC"',
    'how': [
         {'*': 'method',
          'are': {
              '*': 'yup',
              'you': {'*': 'me|lambda x: "-".join(list(x))'}}
         }
    ]}

metaform.normalize(data, schema)
{'atoms': '2ABC',
 'length': 6.0,
 'method': ['is', {'yup': {'me': 'd-o-i-n-g'}}]}

And suppose, we want to define a more complex function, inconvenient via lambdas:

from metaform import converters

def some_func(x):
    a = 123
    b = 345
    return (b-a)*x

converters.func = some_func

schema = {
    '*': 'greeting',
    'hello': 'length|to.func',
    'world': 'atoms|lambda x: str(x)+"ABC"',
    'how': [
         {'*': 'method',
          'are': {
              '*': 'yup',
              'you': {'*': 'me|lambda x: "-".join(list(x))'}}
         }
    ]}

metaform.normalize(data, schema)
{'atoms': '2ABC',
 'length': 222.0,
 'method': ['is', {'yup': {'me': 'd-o-i-n-g'}}]}

We just renamed the keys, and normalized values! What else could we want?

Normalizing Data

Suppose we have similar data from different sources. For example, topics and comments are not so different after all, because if a comment becomes large enough, it can stand as a topic of its own.

topics = requests.get('https://api.infty.xyz/topics/?format=json').json()['results']
comments = requests.get('https://api.infty.xyz/comments/?format=json').json()['results']

Let’s define templates for them, with the key names and types to match:

topics_schema = [{
  'id': 'topic-id',
  'type': '|lambda x: {0: "NEED", 1: "GOAL", 2: "IDEA", 3: "PLAN", 4: "STEP", 5: "TASK"}.get(x)',
  'owner': {'id': 'user-id'},
  'blockchain': '|lambda x: x and True or False',
}]

normal_topics = metaform.normalize(topics, topics_schema)

topics_df = pandas.io.json.json_normalize(normal_topics)
topics_df.dtypes
blockchain             bool
body                 object
categories           object
categories_names     object
children             object
comment_count         int64
created_date         object
data                 object
declared            float64
editors              object
funds               float64
is_draft               bool
languages            object
matched             float64
owner.user-id         int64
owner.username       object
parents              object
title                object
topic-id              int64
type                 object
updated_date         object
url                  object
dtype: object
comments_schema = [{
  'id': 'comment-id',
  'topic': 'topic-url',
  'text': 'body',
  'owner': {'id': 'user-id'},
  'blockchain': '|lambda x: x and True or False',
}]

normal_comments = metaform.normalize(comments, comments_schema)

comments_df = pandas.io.json.json_normalize(normal_comments)
comments_df.dtypes
assumed_hours      object
blockchain           bool
body               object
claimed_hours      object
comment-id          int64
created_date       object
donated           float64
languages          object
matched           float64
owner.user-id       int64
owner.username     object
parent             object
remains           float64
topic-url          object
updated_date       object
url                object
dtype: object
df = pandas.concat([topics_df, comments_df], sort=False)