Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove user data from couch user model #34805

Merged
merged 11 commits into from
Jun 27, 2024
2 changes: 1 addition & 1 deletion corehq/apps/api/resources/v0_1.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ class Meta(CustomResourceMeta):

class CommCareUserResource(UserResource):
groups = fields.ListField(attribute='get_group_ids')
user_data = fields.DictField(attribute='user_data')
user_data = fields.DictField()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Won't the API still be trying to update the user_data attribute on the CommCareUser model? Or is that somehow otherwise accounted for?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, good question. That's handled here:

def _update_user_data(user, new_user_data, user_change_logger):
try:
profile_id = new_user_data.pop(PROFILE_SLUG, ...)
changed = user.get_user_data(user.domain).update(new_user_data, profile_id=profile_id)
except UserDataError as e:
raise UpdateUserException(str(e))
if user_change_logger and changed:
user_change_logger.add_changes({
'user_data': user.get_user_data(user.domain).raw
})

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah I see, obj_update should manually override the update for this field.


class Meta(UserResource.Meta):
authentication = RequirePermissionAuthentication(HqPermissions.edit_commcare_users)
Expand Down
2 changes: 1 addition & 1 deletion corehq/apps/api/user_updates.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def _update_user_data(user, new_user_data, user_change_logger):
if user_change_logger and changed:
user_change_logger.add_changes({
'user_data': user.get_user_data(user.domain).raw
})
}, skip_confirmation=True)


def _update_user_role(user, role, user_change_logger):
Expand Down
23 changes: 23 additions & 0 deletions corehq/apps/domain_migration_flags/api.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
from collections import defaultdict
from datetime import datetime
from functools import wraps

from django.db.models import Q

Expand Down Expand Up @@ -126,3 +127,25 @@ def all_domains_with_migrations_in_progress():
def reset_caches(domain, slug):
any_migrations_in_progress(domain, strict=True)
get_migration_status(domain, slug, strict=True)


def once_off_migration(slug):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this.

"""Ensure that the body of a migration is run only once

You can still run it again if there is an exception
"""
def outer(migration_fn):
@wraps(migration_fn)
def inner(*args, **kwargs):
if get_migration_complete(ALL_DOMAINS, slug):
return
set_migration_started(ALL_DOMAINS, slug)
try:
res = migration_fn(*args, **kwargs)
except Exception:
set_migration_not_started(ALL_DOMAINS, slug)
raise
set_migration_complete(ALL_DOMAINS, slug)
return res
return inner
return outer
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import uuid
from datetime import datetime
from unittest.mock import Mock

from django.test import TestCase

from corehq.apps.domain_migration_flags.api import (
ALL_DOMAINS,
any_migrations_in_progress,
get_migration_complete,
get_migration_status,
migration_in_progress,
once_off_migration,
set_migration_complete,
set_migration_not_started,
set_migration_started,
Expand All @@ -20,10 +23,8 @@

class DomainMigrationProgressTest(TestCase):

@classmethod
def setUpClass(cls):
super(DomainMigrationProgressTest, cls).setUpClass()
cls.slug = uuid.uuid4().hex
def setUp(self):
self.slug = uuid.uuid4().hex

def get_progress(self, domain):
return DomainMigrationProgress.objects.get(domain=domain, migration_slug=self.slug)
Expand Down Expand Up @@ -109,3 +110,42 @@ def test_any_migrations_in_progress(self):
self.assertTrue(any_migrations_in_progress('purple'))
set_migration_complete('purple', 'other_slug')
self.assertFalse(any_migrations_in_progress('purple'))

def test_once_off_decorator(self):
actual_migration = Mock()

@once_off_migration(self.slug)
def basic_migration():
self.assertEqual(get_migration_status(ALL_DOMAINS, self.slug), MigrationStatus.IN_PROGRESS)
actual_migration()

self.assertEqual(get_migration_status(ALL_DOMAINS, self.slug), MigrationStatus.NOT_STARTED)
actual_migration.assert_not_called()

basic_migration()
self.assertEqual(get_migration_status(ALL_DOMAINS, self.slug), MigrationStatus.COMPLETE)
actual_migration.assert_called_once()

basic_migration()
actual_migration.assert_called_once()

def test_once_off_decorator_failure(self):
actual_migration = Mock()

@once_off_migration(self.slug)
def failing_migration():
self.assertEqual(get_migration_status(ALL_DOMAINS, self.slug), MigrationStatus.IN_PROGRESS)
actual_migration()
raise ValueError('this migration failed')

self.assertEqual(get_migration_status(ALL_DOMAINS, self.slug), MigrationStatus.NOT_STARTED)
self.assertEqual(actual_migration.call_count, 0)

with self.assertRaises(ValueError):
failing_migration()
self.assertEqual(get_migration_status(ALL_DOMAINS, self.slug), MigrationStatus.NOT_STARTED)
self.assertEqual(actual_migration.call_count, 1)

with self.assertRaises(ValueError):
failing_migration()
self.assertEqual(actual_migration.call_count, 2)
33 changes: 14 additions & 19 deletions corehq/apps/geospatial/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import jsonschema
from memoized import memoized

from dimagi.utils.couch.bulk import get_docs
from dimagi.utils.couch.database import iter_docs
from dimagi.utils.web import json_request, json_response

Expand Down Expand Up @@ -367,32 +366,28 @@ def get_paginated_cases_without_gps(self, domain, page, limit):

def _get_paginated_users_without_gps(domain, page, limit, query):
location_prop_name = get_geo_user_property(domain)
query = (
res = (
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@zandre-eng What do you think of this change?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Apologies for the late response on this as I was away on PTO. The changes look good, and it's nice that we're able to handle pagination now directly in the ES query.

UserES()
.domain(domain)
.mobile_users()
.missing_or_empty_user_data_property(location_prop_name)
.search_string_query(query, ['username'])
.fields(['_id', 'username'])
.sort('created_on', desc=True)
.start((page - 1) * limit)
.size(limit)
.run()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Man was this pulling all the users every function call when you could just pass the page directly to ES? 💀

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep. And then fetching the users again from couch. It does the same thing in the function below this one too - I just didn't want to get carried away with cleanup.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice, I see tests cover this

)

paginator = Paginator(query.get_ids(), limit)
user_ids_page = list(paginator.get_page(page))
user_docs = get_docs(CommCareUser.get_db(), keys=user_ids_page)
user_data = []
for user_doc in user_docs:
lat, lon = get_lat_lon_from_dict(user_doc['user_data'], location_prop_name)
user_data.append(
{
'id': user_doc['_id'],
'name': user_doc['username'].split('@')[0],
'lat': lat,
'lon': lon,
}
)
return {
'items': user_data,
'total': paginator.count,
'items': [
{
'id': hit['_id'],
'name': hit['username'].split('@')[0],
'lat': '',
'lon': '',
} for hit in res.hits
],
'total': res.total,
}


Expand Down
18 changes: 7 additions & 11 deletions corehq/apps/user_importer/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,15 @@ def __init__(self, upload_domain, user_domain, user, is_new_user, changed_by_use

if not is_new_user:
self.original_user_doc = self.user.to_json()
self.original_user_data = self.user.get_user_data(user_domain).raw
else:
self.original_user_doc = None
self.original_user_data = None

self.fields_changed = {}
self.change_messages = {}

self._save = False # flag to check if log needs to be saved for updates

def add_changes(self, changes):
def add_changes(self, changes, skip_confirmation=False):
"""
Add changes to user properties.
Ignored for new user since the whole user doc is logged for a new user
Expand All @@ -60,7 +58,7 @@ def add_changes(self, changes):
if self.is_new_user:
return
for name, new_value in changes.items():
if self.original_user_doc[name] != new_value:
if skip_confirmation or self.original_user_doc[name] != new_value:
self.fields_changed[name] = new_value
self._save = True

Expand Down Expand Up @@ -157,6 +155,8 @@ def update_user_data(self, data, uncategorized_data, profile_name, profiles_by_n
from corehq.apps.users.user_data import UserDataError
user_data = self.user.get_user_data(self.user_domain)
old_profile_id = user_data.profile_id
old_user_data = user_data.raw

if PROFILE_SLUG in data:
raise UserUploadError(_("You cannot set {} directly").format(PROFILE_SLUG))
if profile_name:
Expand All @@ -174,20 +174,16 @@ def update_user_data(self, data, uncategorized_data, profile_name, profiles_by_n
if user_data.profile_id and user_data.profile_id != old_profile_id:
self.logger.add_info(UserChangeMessage.profile_info(user_data.profile_id, profile_name))

if old_user_data != user_data.raw:
self.logger.add_changes({'user_data': user_data.raw}, skip_confirmation=True)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand we did that before too. But is logging the user data a good idea?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you elaborate? This code logs all changes made to a user for auditing purposes.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe it is just the name of the field being user_data and in actuality it does not contain anything sensitive. But I wonder if there was a chance something that we don't want logged ends up in there. I think it would not be clear to every developer changing what goes in user_data that it will also end up in the logs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

true - though logger here actually isn't a normal python logger, it's an instance of UserChangeLogger, which saves details of changes to postgres to power an auditing report.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fair


def save_log(self):
# Tracking for role is done post save to have role setup correctly on save
if self.role_updated:
new_role = self.user.get_role(domain=self.user_domain)
self.logger.add_info(UserChangeMessage.role_change(new_role))

self._include_user_data_changes()
return self.logger.save()

def _include_user_data_changes(self):
new_user_data = self.user.get_user_data(self.user_domain).raw
if self.logger.original_user_data != new_user_data:
self.logger.add_changes({'user_data': new_user_data})


class CommCareUserImporter(BaseUserImporter):

Expand Down
10 changes: 6 additions & 4 deletions corehq/apps/userreports/expressions/specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -600,19 +600,21 @@ def __call__(self, item, evaluation_context=None):
@staticmethod
@ucr_context_cache(vary_on=('related_doc_type', 'doc_id',))
def _get_document(related_doc_type, doc_id, evaluation_context):
domain = evaluation_context.root_doc['domain']
assert domain
document_store = get_document_store_for_doc_type(
evaluation_context.root_doc['domain'], related_doc_type,
load_source="related_doc_expression")
domain, related_doc_type, load_source="related_doc_expression")
try:
doc = document_store.get_document(doc_id)
except DocumentNotFoundError:
return None
if evaluation_context.root_doc['domain'] != doc.get('domain'):
if domain != doc.get('domain'):
return None
if related_doc_type == 'CommCareUser':
doc['user_data'] = CommCareUser.wrap(doc).get_user_data(domain).to_dict()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess user data just hasn't been part of UCRs since the migration?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do these expressions not apply to WebUsers too?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess user data just hasn't been part of UCRs since the migration?

That's right - this has been broken since the migration.

You can't do a related doc lookup on a web user, since they don't have a top level domain attribute. For the same reason, they also don't have a user_data attribute (since it varies from one domain to the next), though I guess we could inject one in this context.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For what it's worth, this means of directly accessing the user_data from the raw JSON has been broken to some degree for years - user data is only supposed to be accessed via helper methods, as fields controlled by the profile aren't stored directly in the user document. That means that the profile wasn't accounted for in the API or in UCR. I just broke it further 😆

return doc

def get_value(self, doc_id, evaluation_context):
assert evaluation_context.root_doc['domain']
doc = self._get_document(self.related_doc_type, doc_id, evaluation_context)
# explicitly use a new evaluation context since this is a new document
return self._value_expression(doc, EvaluationContext(doc, 0))
Expand Down
19 changes: 19 additions & 0 deletions corehq/apps/userreports/tests/test_expressions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,25 @@ def test_other_lookups(self):
doc = self._get_doc(user_id)
self.assertEqual(user_id, expression(doc, EvaluationContext(doc, 0)))

def test_user_data_lookup(self):
user = CommCareUser.create(self.domain, 'username', "123", None, None,
user_data={'favorite_color': 'indigo'})
self.addCleanup(user.delete, None, None)
expression = ExpressionFactory.from_spec({
"type": "related_doc",
"related_doc_type": 'CommCareUser',
"doc_id_expression": {
"type": "property_name",
"property_name": "related_id"
},
"value_expression": {
"type": "property_path",
"property_path": ["user_data", "favorite_color"],
},
})
doc = self._get_doc(user._id)
self.assertEqual('indigo', expression(doc, EvaluationContext(doc, 0)))

@staticmethod
def _get_expression(doc_type):
return ExpressionFactory.from_spec({
Expand Down
49 changes: 0 additions & 49 deletions corehq/apps/users/management/commands/populate_sql_user_data.py

This file was deleted.

36 changes: 36 additions & 0 deletions corehq/apps/users/management/commands/rm_couch_user_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# One-off migration, June 2024
from itertools import chain

from django.core.management.base import BaseCommand

from corehq.apps.domain_migration_flags.api import once_off_migration
from corehq.apps.users.models import CommCareUser
from corehq.dbaccessors.couchapps.all_docs import (
get_doc_count_by_type,
iter_all_doc_ids,
)
from corehq.util.couch import DocUpdate, iter_update
from corehq.util.log import with_progress_bar


class Command(BaseCommand):
help = "Remove now-stale CouchUser.user_data field"

def handle(self, *args, **options):
do_migration()


@once_off_migration("rm_couch_user_data")
def do_migration():
db = CommCareUser.get_db()
count = (get_doc_count_by_type(db, 'WebUser')
+ get_doc_count_by_type(db, 'CommCareUser'))
all_ids = chain(iter_all_doc_ids(db, 'WebUser'),
iter_all_doc_ids(db, 'CommCareUser'))
iter_update(db, _update_user, with_progress_bar(all_ids, count), verbose=True)


def _update_user(user_doc):
couch_data = user_doc.pop('user_data', ...)
if couch_data is not ...:
return DocUpdate(user_doc)
Loading
Loading