diff --git a/.travis.yml b/.travis.yml
index 592935c6..f49ef558 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,5 @@
 language: python
 python:
-  - "2.6"
   - "2.7"
 #  - "3.3"
 install:
diff --git a/commcare_export/cli.py b/commcare_export/cli.py
index 2d45689a..e463e324 100644
--- a/commcare_export/cli.py
+++ b/commcare_export/cli.py
@@ -50,6 +50,7 @@ def main(argv):
     parser.add_argument('--verbose', default=False, action='store_true')
     parser.add_argument('--output-format', default='json', choices=['json', 'csv', 'xls', 'xlsx', 'sql', 'markdown'], help='Output format')
     parser.add_argument('--output', metavar='PATH', default='reports.zip', help='Path to output; defaults to `reports.zip`.')
+    parser.add_argument('--strict-types', default=False, action='store_true', help="When saving to a SQL database don't allow changing column types once they are created.")

     args = parser.parse_args(argv)

@@ -137,7 +138,7 @@ def main_with_args(args):
         # Writer had bizarre issues so we use a full connection instead of passing in a URL or engine
         import sqlalchemy
         engine = sqlalchemy.create_engine(args.output)
-        writer = writers.SqlTableWriter(engine.connect())
+        writer = writers.SqlTableWriter(engine.connect(), args.strict_types)

         if not args.since and not args.start_over and os.path.exists(args.query):
             connection = sqlalchemy.create_engine(args.output)
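Reviewer aside (not part of the patch): the new flag only matters for the SQL writer. A minimal sketch of driving it through `main(argv)` from `cli.py` above — the query path and database URL are placeholders, and a real run also needs the project/credential arguments, which are omitted here.

```python
from commcare_export.cli import main

# Placeholder query file and DB URL; --strict-types keeps column types
# fixed once they are created instead of widening them on mismatch.
main([
    '--query', 'demo_query.xlsx',
    '--output-format', 'sql',
    '--output', 'postgresql://user@localhost/commcare',
    '--strict-types',
])
```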
diff --git a/commcare_export/env.py b/commcare_export/env.py
index fb588ea0..c9d8b219 100644
--- a/commcare_export/env.py
+++ b/commcare_export/env.py
@@ -5,6 +5,7 @@
 from jsonpath_rw import jsonpath
 from jsonpath_rw.parser import parse as parse_jsonpath

+from commcare_export.misc import unwrap
 from commcare_export.repeatable_iterator import RepeatableIterator

@@ -217,6 +218,36 @@ def emitted_tables(self):
 # Actual concrete environments, basically with built-in functions.
 #

+@unwrap
+def str2bool(val):
+    if isinstance(val, bool):
+        return val
+    return val and str(val).lower() in {'true', 't', '1'}
+
+
+@unwrap
+def str2num(val):
+    if val is None:
+        return None
+
+    try:
+        return int(val)
+    except ValueError:
+        return float(val)
+
+
+@unwrap
+def str2date(val):
+    import dateutil.parser as parser
+    if not val:
+        return None
+    return parser.parse(val)
+
+
+@unwrap
+def bool2int(val):
+    return int(str2bool(val))
+
+
 class BuiltInEnv(DictEnv):
     """
     A built-in environment of operators and functions
@@ -241,6 +272,10 @@ def __init__(self):
             '<=' : operator.__le__,
             'len' : len,
             'bool': bool,
+            'str2bool': str2bool,
+            'bool2int': bool2int,
+            'str2num': str2num,
+            'str2date': str2date,
         })

     def bind(self, name, value): raise CannotBind()
diff --git a/commcare_export/minilinq.py b/commcare_export/minilinq.py
index dba96b76..bb538f53 100644
--- a/commcare_export/minilinq.py
+++ b/commcare_export/minilinq.py
@@ -7,6 +7,7 @@
 from jsonpath_rw import jsonpath
 from jsonpath_rw.parser import parse as parse_jsonpath

+from commcare_export.misc import unwrap
 from commcare_export.repeatable_iterator import RepeatableIterator

@@ -377,21 +378,12 @@ def __init__(self, table, headings, source):
         self.headings = headings
         self.source = source

+    @unwrap
     def coerce_cell_blithely(self, cell):
-        if isinstance(cell, jsonpath.DatumInContext):
-            cell = cell.value
-
-        if isinstance(cell, six.string_types):
+        if isinstance(cell, list):
+            return ','.join([self.coerce_cell(item) for item in cell])
+        else:
             return cell
-        elif isinstance(cell, int):
-            return str(cell)
-        elif isinstance(cell, datetime):
-            return cell
-        elif cell is None:
-            return ''
-
-        # In all other cases, coerce to a list and join with ',' for now
-        return ','.join([self.coerce_cell(item) for item in list(cell)])

     def coerce_cell(self, cell):
         try:
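Reviewer aside: the new builtins are exercised end to end in `tests/test_minilinq.py` below; for quick reference, a sketch of their behaviour (imports mirror the test module):

```python
from commcare_export.env import BuiltInEnv, DictEnv
from commcare_export.minilinq import Apply, Literal, Reference

env = BuiltInEnv() | DictEnv({})

Apply(Reference('str2bool'), Literal('true')).eval(env)        # True
Apply(Reference('str2num'), Literal('10.56')).eval(env)        # 10.56
Apply(Reference('str2date'), Literal('2015-01-01')).eval(env)  # datetime(2015, 1, 1)
Apply(Reference('bool2int'), Literal('t')).eval(env)           # 1, i.e. int(str2bool('t'))
```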
diff --git a/commcare_export/misc.py b/commcare_export/misc.py
index ed7bc243..6ea73b66 100644
--- a/commcare_export/misc.py
+++ b/commcare_export/misc.py
@@ -1,7 +1,10 @@
 from __future__ import unicode_literals, print_function, absolute_import, division, generators, nested_scopes
+import functools
 import hashlib
 import io
-import json
+from jsonpath_rw import jsonpath
+from commcare_export.repeatable_iterator import RepeatableIterator
+

 def digest_file(path):
     with io.open(path, 'rb') as filehandle:
@@ -12,3 +15,27 @@ def digest_file(path):
                 break
             digest.update(chunk)
     return digest.hexdigest()
+
+
+def unwrap(fn):
+    @functools.wraps(fn)
+    def _inner(*args):
+        # handle case when fn is a class method and first arg is 'self'
+        val = args[1] if len(args) == 2 else args[0]
+
+        if isinstance(val, RepeatableIterator):
+            val = list(val)
+
+        if isinstance(val, list):
+            if len(val) == 1:
+                val = val[0]
+            else:
+                val = map(_inner, val)
+
+        if isinstance(val, jsonpath.DatumInContext):
+            val = val.value
+
+        # call fn with 'self' if necessary
+        return fn(*([val] if len(args) == 1 else [args[0], val]))
+
+    return _inner
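Reviewer aside: `unwrap` centralizes the normalization that `coerce_cell_blithely` used to do inline — it realizes `RepeatableIterator`s, collapses single-element lists, and strips `jsonpath.DatumInContext` wrappers before the wrapped function runs. A tiny illustration (`first_char` is hypothetical):

```python
from commcare_export.misc import unwrap

@unwrap
def first_char(val):
    return val[0]

first_char('hello')    # plain values pass straight through -> 'h'
first_char(['hello'])  # a single-element list is collapsed first -> 'h'
```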
diff --git a/commcare_export/writers.py b/commcare_export/writers.py
index 85d82010..c1e82e14 100644
--- a/commcare_export/writers.py
+++ b/commcare_export/writers.py
@@ -9,6 +9,7 @@
 from six import StringIO, u
 from itertools import chain
+import datetime

 logger = logging.getLogger(__name__)

@@ -19,6 +20,20 @@ def ensure_text(v):
         return v
     elif isinstance(v, six.binary_type):
         return u(v)
+    elif isinstance(v, datetime.datetime):
+        return v.strftime('%Y-%m-%d %H:%M:%S')
+    elif isinstance(v, datetime.date):
+        return v.isoformat()
+    elif v is None:
+        return ''
+    else:
+        return u(str(v))
+
+
+def to_jvalue(v):
+    if isinstance(v, (six.text_type,) + six.integer_types):
+        return v
+    elif isinstance(v, six.binary_type):
+        return u(v)
     else:
         return u(str(v))

@@ -141,7 +156,7 @@ def write_table(self, table):
         # Ensures the table is iterable; probably better to create a custom JSON handler that runs in constant space
         self.tables.append(dict(name=table['name'],
                                 headings=list(table['headings']),
-                                rows=[list(row) for row in table['rows']]))
+                                rows=[[to_jvalue(v) for v in row] for row in table['rows']]))

 class StreamingMarkdownTableWriter(TableWriter):
     """
@@ -156,7 +171,7 @@ def write_table(self, table):
             self.output_stream.write('|%s|\n' % '|'.join(table['headings']))

         for row in table['rows']:
-            self.output_stream.write('|%s|\n' % '|'.join(row))
+            self.output_stream.write('|%s|\n' % '|'.join(ensure_text(val) for val in row))

 class SqlTableWriter(TableWriter):
     """
@@ -167,7 +182,7 @@ class SqlTableWriter(TableWriter):
     MIN_VARCHAR_LEN=32   # Since SQLite does not actually support ALTER COLUMN type, let's maximize the chance that we do not have to write workarounds by starting medium
     MAX_VARCHAR_LEN=255  # Arbitrary point at which we switch to TEXT; for postgres VARCHAR == TEXT anyhow and for Sqlite it doesn't matter either

-    def __init__(self, connection):
+    def __init__(self, connection, strict_types=False):
         try:
             import sqlalchemy
             import alembic
@@ -179,6 +194,7 @@ def __init__(self):
                             "command: pip install sqlalchemy alembic")

         self.base_connection = connection
+        self.strict_types = strict_types

     def __enter__(self):
         self.connection = self.base_connection.connect()  # "forks" the SqlAlchemy connection
@@ -199,10 +215,22 @@ def metadata(self):
             self._metadata.reflect()
         return self._metadata

+    @property
+    def is_sqllite(self):
+        return 'sqlite' in self.connection.engine.driver
+
     def table(self, table_name):
         return self.sqlalchemy.Table(table_name, self.metadata, autoload=True, autoload_with=self.connection)

     def best_type_for(self, val):
+        if not self.is_sqllite:
+            if isinstance(val, bool):
+                return self.sqlalchemy.Boolean()
+            elif isinstance(val, datetime.datetime):
+                return self.sqlalchemy.DateTime()
+            elif isinstance(val, datetime.date):
+                return self.sqlalchemy.Date()
+
         if isinstance(val, int):
             return self.sqlalchemy.Integer()
         elif isinstance(val, six.string_types):
@@ -225,15 +253,9 @@ def compatible(self, source_type, dest_type):
         """
         Checks _coercion_ compatibility.
         """
-
-        # FIXME: Add datetime and friends
-        if isinstance(source_type, self.sqlalchemy.Integer):
-            # Integers can be cast to varch
-            return True
-
         if isinstance(source_type, self.sqlalchemy.String):
             if not isinstance(dest_type, self.sqlalchemy.String):
-                False
+                return False
             elif source_type.length is None:
                 # The length being None means that we are looking at indefinite strings aka TEXT.
                 # This tool will never create strings with bounds, but if a target DB has one then
@@ -243,6 +265,18 @@ def compatible(self, source_type, dest_type):
             else:
                 return (dest_type.length >= source_type.length)

+        compatibility = {
+            self.sqlalchemy.String: (self.sqlalchemy.Text,),
+            self.sqlalchemy.Integer: (self.sqlalchemy.String, self.sqlalchemy.Text),
+            self.sqlalchemy.Boolean: (self.sqlalchemy.String, self.sqlalchemy.Text),
+            self.sqlalchemy.DateTime: (self.sqlalchemy.String, self.sqlalchemy.Text, self.sqlalchemy.Date),
+            self.sqlalchemy.Date: (self.sqlalchemy.String, self.sqlalchemy.Text),
+        }
+        for _type, types in compatibility.items():
+            if isinstance(source_type, _type):
+                return isinstance(dest_type, (_type,) + types)
+
+
     def least_upper_bound(self, source_type, dest_type):
         """
         Returns the _coercion_ least uppper bound.
@@ -268,25 +302,42 @@ def make_table_compatible(self, table_name, row_dict):
             op.create_table(table_name, id_column)
             self.metadata.reflect()

+        def get_cols():
+            return {c.name: c for c in self.table(table_name).columns}
+
+        columns = get_cols()
+
         for column, val in row_dict.items():
             ty = self.best_type_for(val)
-
-            if not column in [c.name for c in self.table(table_name).columns]:
+            if not column in columns:
                 # If we are creating the column, a None crashes things even though it is the "empty" type
                 # but SQL does not have such a type. So we have to guess a liberal type for future use.
                 ty = ty or self.sqlalchemy.UnicodeText()
                 op.add_column(table_name, self.sqlalchemy.Column(column, ty, nullable=True))
                 self.metadata.clear()
                 self.metadata.reflect()
-
+                columns = get_cols()
             else:
-                columns = dict([(c.name, c) for c in self.table(table_name).columns])
+                if val is None:
+                    continue
+
                 current_ty = columns[column].type
-
-                if not self.compatible(ty, current_ty) and not ('sqlite' in self.connection.engine.driver):
-                    op.alter_column(table_name, column, type_ = self.least_upper_bound(current_ty, ty))
+                if not self.compatible(ty, current_ty):
+                    new_type = self.least_upper_bound(ty, current_ty)
+                    if self.strict_types:
+                        logger.warn('Type mismatch detected for column %s (%s != %s) '
+                                    'but strict types in use.', columns[column], current_ty, new_type)
+                        continue
+                    if self.is_sqllite:
+                        logger.warn('Type mismatch detected for column %s (%s != %s) '
+                                    'but sqlite does not support changing column types', columns[column], current_ty, new_type)
+                        continue
+                    logger.warn('Altering column %s from %s to %s', columns[column], current_ty, new_type)
+                    op.alter_column(table_name, column, type_=new_type)
                     self.metadata.clear()
                     self.metadata.reflect()
+                    columns = get_cols()

     def upsert(self, table, row_dict):
diff --git a/setup.py b/setup.py
index d39636a4..320427d7 100644
--- a/setup.py
+++ b/setup.py
@@ -62,7 +62,7 @@ def run_tests(self):
         'alembic',
         'argparse',
         'jsonpath-rw>=1.2.1',
-        'openpyxl>=2.0.3',
+        'openpyxl<2.1.0',
         'python-dateutil',
         'requests',
         'ndg-httpsclient',
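Reviewer aside: a sketch of the widening behaviour this implements (the database URL is a placeholder; the writer usage mirrors `tests/test_writers.py` below). With `strict_types=True`, or on SQLite, which lacks `ALTER COLUMN` support, the second write would log the mismatch and leave the column type alone.

```python
import sqlalchemy
from commcare_export.writers import SqlTableWriter

engine = sqlalchemy.create_engine('postgresql://user@localhost/scratch')  # placeholder
with engine.connect() as conn:
    writer = SqlTableWriter(conn)  # strict_types defaults to False
    with writer:
        writer.write_table({
            'name': 'widening_demo',
            'headings': ['id', 'a'],
            'rows': [['one', 1]],  # column 'a' is created with an integer type
        })

    writer = SqlTableWriter(conn)
    with writer:
        writer.write_table({
            'name': 'widening_demo',
            'headings': ['id', 'a'],
            'rows': [['two', 'not a number']],  # incompatible: 'a' is altered to a string type
        })
```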
Apply(Reference("str2bool"), Literal('t')).eval(env) == True + assert Apply(Reference("str2bool"), Literal('1')).eval(env) == True + assert Apply(Reference("str2bool"), Literal('0')).eval(env) == False + assert Apply(Reference("str2bool"), Literal('false')).eval(env) == False + assert Apply(Reference("str2num"), Literal('10')).eval(env) == 10 + assert Apply(Reference("str2num"), Literal('10.56')).eval(env) == 10.56 + assert Apply(Reference("str2date"), Literal('2015-01-01')).eval(env) == datetime(2015, 1, 1) + assert Apply(Reference("str2date"), Literal('2015-01-01T18:32:57')).eval(env) == datetime(2015, 1, 1, 18, 32, 57) + assert Apply(Reference("str2date"), Literal('2015-01-01T18:32:57.001200')).eval(env) == datetime(2015, 1, 1, 18, 32, 57, 1200) def test_map(self): env = BuiltInEnv() | DictEnv({}) @@ -90,14 +102,14 @@ def test_flatmap(self): pass def test_emit(self): - env = BuiltInEnv() | JsonPathEnv({'foo': {'baz': 3}}) + env = BuiltInEnv() | JsonPathEnv({'foo': {'baz': 3, 'bar': True}}) Emit(table='Foo', headings=[Literal('foo')], source=List([ - List([ Reference('foo.baz') ]) + List([ Reference('foo.baz'), Reference('foo.bar') ]) ])).eval(env) - assert list(list(env.emitted_tables())[0]['rows']) == [['3']] + assert list(list(env.emitted_tables())[0]['rows']) == [[3, True]] def test_from_jvalue(self): assert MiniLinq.from_jvalue({"Ref": "form.log_subreport"}) == Reference("form.log_subreport") diff --git a/tests/test_writers.py b/tests/test_writers.py index d99aca73..825b5bcf 100644 --- a/tests/test_writers.py +++ b/tests/test_writers.py @@ -6,9 +6,14 @@ import openpyxl import sqlalchemy +import datetime from commcare_export.writers import * +MYSQL_TYPE_MAP = { + bool: lambda x: str(int(x)) +} + class TestWriters(unittest.TestCase): SUPERUSER_POSTGRES_URL = 'postgresql://postgres@/postgres' @@ -70,23 +75,37 @@ def teardown_class(cls): conn.execute('rollback') conn.execute('drop database if exists %s' % cls.TEST_MYSQL_DB) + def _type_convert(self, connection, row): + """ + Different databases store and return values differently so convert the values + in the expected row to match the DB. 
+ """ + def convert(type_map, value): + func = type_map.get(value.__class__, None) + return func(value) if func else value + + if 'mysql' in connection.engine.driver: + return {k: convert(MYSQL_TYPE_MAP, v) for k, v in row.items()} + + return row + def test_JValueTableWriter(self): writer = JValueTableWriter() writer.write_table({ 'name': 'foo', - 'headings': ['a', 'bjørn', 'c'], + 'headings': ['a', 'bjørn', 'c', 'd'], 'rows': [ - [1, '2', 3], - [4, '日本', 6], + [1, '2', 3, datetime.date(2015, 1, 1)], + [4, '日本', 6, datetime.date(2015, 1, 2)], ] }) assert writer.tables == [{ 'name': 'foo', - 'headings': ['a', 'bjørn', 'c'], + 'headings': ['a', 'bjørn', 'c', 'd'], 'rows': [ - [1, '2', 3], - [4, '日本', 6], + [1, '2', 3, '2015-01-01'], + [4, '日本', 6, '2015-01-02'], ], }] @@ -180,41 +199,48 @@ def SqlWriter_upsert_tests(self, connection): assert dict(result['bizzle']) == {'id': 'bizzle', 'a': 7, 'b': '本', 'c': 9} assert dict(result['bazzle']) == {'id': 'bazzle', 'a': 4, 'b': '日本', 'c': 6} - def SqlWriter_fancy_tests(self, connection): + def SqlWriter_types_test(self, connection, table_name=None): + table_name = table_name or 'foo_fancy_types' writer = SqlTableWriter(connection) with writer: writer.write_table({ - 'name': 'foo_fancy', - 'headings': ['id', 'a', 'b', 'c'], + 'name': table_name or 'foo_fancy_types', + 'headings': ['id', 'a', 'b', 'c', 'd', 'e'], 'rows': [ - ['bizzle', 1, 'yo', 3], - ['bazzle', 4, '日本', 6], + ['bizzle', 1, 'yo', True, datetime.date(2015, 1, 1), datetime.datetime(2014, 4, 2, 18, 56, 12)], + ['bazzle', 4, '日本', False, datetime.date(2015, 1, 2), datetime.datetime(2014, 5, 1, 11, 16, 45)], ] }) - + # We can use raw SQL instead of SqlAlchemy expressions because we built the DB above - result = dict([(row['id'], row) for row in connection.execute('SELECT id, a, b, c FROM foo_fancy')]) - + result = dict([(row['id'], row) for row in connection.execute('SELECT id, a, b, c, d, e FROM %s' % table_name)]) + assert len(result) == 2 - assert dict(result['bizzle']) == {'id': 'bizzle', 'a': 1, 'b': 'yo', 'c': 3} - assert dict(result['bazzle']) == {'id': 'bazzle', 'a': 4, 'b': '日本', 'c': 6} + assert dict(result['bizzle']) == self._type_convert(connection, {'id': 'bizzle', 'a': 1, 'b': 'yo', 'c': True, + 'd': datetime.date(2015, 1, 1), 'e': datetime.datetime(2014, 4, 2, 18, 56, 12)}) + assert dict(result['bazzle']) == self._type_convert(connection, {'id': 'bazzle', 'a': 4, 'b': '日本', 'c': False, + 'd': datetime.date(2015, 1, 2), 'e': datetime.datetime(2014, 5, 1, 11, 16, 45)}) + + def SqlWriter_change_type_test(self, connection, expected): + self.SqlWriter_types_test(connection, 'foo_fancy_type_changes') writer = SqlTableWriter(connection) with writer: writer.write_table({ - 'name': 'foo_fancy', - 'headings': ['id', 'a', 'b', 'c'], + 'name': 'foo_fancy_type_changes', + 'headings': ['id', 'a', 'b', 'c', 'd', 'e'], 'rows': [ - ['bizzle', 'yo dude', '本', 9], + ['bizzle', 'yo dude', '本', 5, datetime.datetime(2015, 2, 13), '2014-08-01T11:23:45:00.0000Z'], ] }) # We can use raw SQL instead of SqlAlchemy expressions because we built the DB above - result = dict([(row['id'], row) for row in connection.execute('SELECT id, a, b, c FROM foo_fancy')]) + result = dict([(row['id'], row) for row in connection.execute('SELECT id, a, b, c, d, e FROM foo_fancy_type_changes')]) assert len(result) == 2 - assert dict(result['bizzle']) == {'id': 'bizzle', 'a': 'yo dude', 'b': '本', 'c': 9} - assert dict(result['bazzle']) == {'id': 'bazzle', 'a': '4', 'b': '日本', 'c': 6} + for id, row in result.items(): + 
assert id in expected + assert dict(row) == expected[id] def test_postgres_insert(self): with self.postgres_engine.connect() as conn: @@ -241,18 +267,46 @@ def test_sqlite_upsert(self): with self.sqlite_engine.connect() as conn: self.SqlWriter_upsert_tests(conn) - def test_postgres_fancy(self): + def test_postgres_type_changes(self): + ''' + These tests cannot be accomplished with Sqlite because it does not support these + core features such as column type changes + ''' + expected = { + 'bizzle': {'id': 'bizzle', 'a': 'yo dude', 'b': '本', 'c': '5', + 'd': datetime.date(2015, 2, 13), 'e': '2014-08-01T11:23:45:00.0000Z'}, + 'bazzle': {'id': 'bazzle', 'a': '4', 'b': '日本', 'c': 'false', + 'd': datetime.date(2015, 1, 2), 'e': '2014-05-01 11:16:45'} + } + with self.postgres_engine.connect() as conn: + self.SqlWriter_change_type_test(conn, expected) + + def test_postgres_types(self): ''' These tests cannot be accomplished with Sqlite because it does not support these core features such as column type changes ''' with self.postgres_engine.connect() as conn: - self.SqlWriter_fancy_tests(conn) + self.SqlWriter_types_test(conn) + + def test_mysql_type_changes(self): + ''' + These tests cannot be accomplished with Sqlite because it does not support these + core features such as column type changes + ''' + expected = { + 'bizzle': {'id': 'bizzle', 'a': 'yo dude', 'b': '本', 'c': '5', + 'd': datetime.date(2015, 2, 13), 'e': '2014-08-01T11:23:45:00.0000Z'}, + 'bazzle': {'id': 'bazzle', 'a': '4', 'b': '日本', 'c': '0', + 'd': datetime.date(2015, 1, 2), 'e': '2014-05-01 11:16:45'} + } + with self.mysql_engine.connect() as conn: + self.SqlWriter_change_type_test(conn, expected) - def test_mysql_fancy(self): + def test_mysql_types(self): ''' These tests cannot be accomplished with Sqlite because it does not support these core features such as column type changes ''' with self.mysql_engine.connect() as conn: - self.SqlWriter_fancy_tests(conn) + self.SqlWriter_types_test(conn)
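Reviewer aside: the `ensure_text` change also benefits the markdown writer. A sketch (assumes `StreamingMarkdownTableWriter` takes the output stream as its constructor argument, as the `output_stream` attribute above suggests):

```python
import datetime
import sys

from commcare_export.writers import StreamingMarkdownTableWriter

writer = StreamingMarkdownTableWriter(sys.stdout)
writer.write_table({
    'name': 'Demo',
    'headings': ['case', 'opened', 'count'],
    'rows': [['a-case', datetime.date(2015, 1, 1), 3]],
})
# The data row renders as |a-case|2015-01-01|3| instead of raising on non-string cells.
```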