From 6f0a0bf0196ae56f18dea246b663a94398abde28 Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Mon, 8 Jun 2026 05:08:26 -0400 Subject: [PATCH 01/15] feat(chord): validate translation constraints on create/update Reject translations where primary_contact or stakeholder roles differ from the canonical dataset, or where any list field gains items. --- chord_metadata_service/chord/serializers.py | 59 ++++++ .../chord/tests/test_api_translations.py | 171 +++++++++++++++++- 2 files changed, 229 insertions(+), 1 deletion(-) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index d1d78799a..c5ae87c3d 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -126,6 +126,60 @@ def update(self, instance, validated_data): return instance +def _roles_for(contact) -> list: + return getattr(contact, "roles", []) or [] + + +def _check_translation_constraints(translation: ProjectScopedDatasetModel): + """ + Validate two invariants that translations must respect vs. the canonical (English) dataset: + 1. roles on primary_contact and each stakeholder cannot change. + 2. no list field may gain items (translations can omit or keep, not add). + """ + try: + dataset = Dataset.objects.get(identifier=str(translation.identifier)) + except Dataset.DoesNotExist: + return + + canonical = dataset.to_schema() + errors = {} + + # Rule 1: roles immutable + c_roles = _roles_for(canonical.primary_contact) + t_roles = _roles_for(translation.primary_contact) + if c_roles != t_roles: + errors["primary_contact"] = ["Roles cannot change in a translation."] + + c_stakeholders = canonical.stakeholders or [] + t_stakeholders = translation.stakeholders or [] + for i, (c_sh, t_sh) in enumerate(zip(c_stakeholders, t_stakeholders)): + if _roles_for(c_sh) != _roles_for(t_sh): + errors.setdefault("stakeholders", []).append( + f"Stakeholder at index {i}: roles cannot change in a translation." + ) + + # Rule 2: arrays cannot grow + _LIST_FIELDS = ( + "taxa", "keywords", "resources", "stakeholders", "counts", + "links", "publications", "logos", "participant_criteria", "domain", + ) + for field in _LIST_FIELDS: + c_val = getattr(canonical, field, None) or [] + t_val = getattr(translation, field, None) or [] + if isinstance(c_val, list) and isinstance(t_val, list) and len(t_val) > len(c_val): + errors[field] = [f"Translation cannot add items to '{field}' (canonical has {len(c_val)})."] + + c_fs = canonical.funding_sources + t_fs = translation.funding_sources + if isinstance(c_fs, list) and isinstance(t_fs, list) and len(t_fs) > len(c_fs): + errors["funding_sources"] = [ + f"Translation cannot add items to 'funding_sources' (canonical has {len(c_fs)})." + ] + + if errors: + raise serializers.ValidationError(errors) + + class DatasetTranslationSerializer(PydanticJSONBSerializer): schema_class = ProjectScopedDatasetModel @@ -134,6 +188,11 @@ class Meta: fields = "__all__" read_only_fields = ['created_at', 'updated_at', 'dataset'] + def to_internal_value(self, data): + result = super().to_internal_value(data) + _check_translation_constraints(self._validated_schema) + return result + def to_representation(self, instance): data = super().to_representation(instance) data['created_at'] = instance.created_at diff --git a/chord_metadata_service/chord/tests/test_api_translations.py b/chord_metadata_service/chord/tests/test_api_translations.py index ef12a3638..8bb56b343 100644 --- a/chord_metadata_service/chord/tests/test_api_translations.py +++ b/chord_metadata_service/chord/tests/test_api_translations.py @@ -5,7 +5,7 @@ from chord_metadata_service.authz.tests.helpers import AuthzAPITestCase from chord_metadata_service.chord.dataset_schema import KatsuDatasetModel -from chord_metadata_service.chord.models import DatasetTranslation +from chord_metadata_service.chord.models import Dataset, DatasetTranslation from chord_metadata_service.chord.tests.constants import VALID_DATASET_PRIMARY_CONTACT from chord_metadata_service.phenopackets.tests.helpers import PhenoTestCase @@ -173,3 +173,172 @@ def test_del_translation_forbidden(self): r = self.one_no_authz_delete(self._translation_url("ja")) self.assertEqual(r.status_code, status.HTTP_403_FORBIDDEN) self.assertTrue(DatasetTranslation.objects.filter(dataset=self.dataset, language="ja").exists()) + + +ROLE_PI = "Principal Investigator" +ROLE_RESEARCHER = "Researcher" + + +class DatasetTranslationValidationTest(AuthzAPITestCase, PhenoTestCase): + """Tests for translation validation: roles immutable, arrays cannot grow.""" + + def _make_dataset(self, primary_contact=None, **kwargs) -> Dataset: + contact = primary_contact or VALID_DATASET_PRIMARY_CONTACT + schema = KatsuDatasetModel( + schema_version="1.0", + title=f"Validation DS {uuid.uuid4().hex[:8]}", + description="Test", + primary_contact=contact, + identifier=str(uuid.uuid4()), + project=str(self.project.identifier), + **kwargs, + ) + ds = Dataset.from_schema(schema) + ds.save() + self.addCleanup(ds.delete) + return ds + + def _list_url(self, dataset: Dataset) -> str: + return reverse("dataset-translations-list", kwargs={"identifier": dataset.identifier}) + + def _detail_url(self, dataset: Dataset, language: str) -> str: + return reverse("dataset-translations-detail", kwargs={ + "identifier": dataset.identifier, + "language": language, + }) + + def _payload(self, dataset: Dataset, primary_contact=None, **kwargs) -> dict: + return { + "schema_version": "1.0", + "title": "Translated Title", + "description": "Translated description", + "primary_contact": primary_contact or VALID_DATASET_PRIMARY_CONTACT, + "identifier": str(dataset.identifier), + "project": str(self.project.identifier), + **kwargs, + } + + def _make_translation_in_db(self, dataset: Dataset, language: str, primary_contact=None, **kwargs) -> DatasetTranslation: + schema = KatsuDatasetModel( + schema_version="1.0", + title=f"Translation {language}", + description="Test", + primary_contact=primary_contact or VALID_DATASET_PRIMARY_CONTACT, + identifier=str(dataset.identifier), + project=str(self.project.identifier), + **kwargs, + ) + t = DatasetTranslation.from_schema(schema, dataset_id=dataset.identifier, language=language) + t.save() + self.addCleanup(t.delete) + return t + + # ---- Rule 1: roles immutable ---- + + def test_create_translation_primary_contact_role_change_rejected(self): + # canonical has ROLE_PI; translation sends empty roles → 400 + contact_with_role = {"type": "person", "name": "Test Contact", "roles": [ROLE_PI]} + ds = self._make_dataset(primary_contact=contact_with_role) + payload = self._payload(ds, language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("primary_contact", r.json()) + + def test_create_translation_primary_contact_role_added_rejected(self): + # canonical has empty roles; translation adds ROLE_PI → 400 + ds = self._make_dataset() + payload = self._payload( + ds, + primary_contact={"type": "person", "name": "Test Contact", "roles": [ROLE_PI]}, + language="fr", + ) + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("primary_contact", r.json()) + + def test_create_translation_stakeholder_role_change_rejected(self): + # canonical stakeholder has ROLE_PI; translation sends ROLE_RESEARCHER → 400 + stakeholder = {"type": "person", "name": "Stakeholder", "roles": [ROLE_PI]} + ds = self._make_dataset(stakeholders=[stakeholder]) + payload = self._payload( + ds, + stakeholders=[{"type": "person", "name": "Stakeholder FR", "roles": [ROLE_RESEARCHER]}], + language="fr", + ) + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("stakeholders", r.json()) + + def test_update_translation_primary_contact_role_change_rejected(self): + # existing translation, PUT changes roles → 400 + contact_with_role = {"type": "person", "name": "Test Contact", "roles": [ROLE_PI]} + ds = self._make_dataset(primary_contact=contact_with_role) + self._make_translation_in_db(ds, "es", primary_contact=contact_with_role) + payload = self._payload(ds, language="es") # roles=[] differs from canonical ROLE_PI + r = self.one_authz_put(self._detail_url(ds, "es"), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("primary_contact", r.json()) + + def test_create_translation_roles_same_ok(self): + # same roles → 201 + contact_with_role = {"type": "person", "name": "Test Contact", "roles": [ROLE_PI]} + ds = self._make_dataset(primary_contact=contact_with_role) + payload = self._payload(ds, primary_contact=contact_with_role, language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_201_CREATED) + + # ---- Rule 2: arrays cannot grow ---- + + def test_create_translation_keywords_grow_rejected(self): + # canonical has 1 keyword; translation sends 2 → 400 + ds = self._make_dataset(keywords=["cancer"]) + payload = self._payload(ds, keywords=["cancer", "genomics"], language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("keywords", r.json()) + + def test_create_translation_stakeholders_grow_rejected(self): + # canonical has 1 stakeholder; translation sends 2 → 400 + stakeholder = {"type": "person", "name": "A", "roles": [ROLE_PI]} + ds = self._make_dataset(stakeholders=[stakeholder]) + payload = self._payload( + ds, + stakeholders=[ + {"type": "person", "name": "A FR", "roles": [ROLE_PI]}, + {"type": "person", "name": "B FR", "roles": [ROLE_RESEARCHER]}, + ], + language="fr", + ) + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("stakeholders", r.json()) + + def test_update_translation_array_grows_rejected(self): + # existing translation, PUT adds keyword → 400 + ds = self._make_dataset(keywords=["cancer"]) + self._make_translation_in_db(ds, "de", keywords=["Krebs"]) + payload = self._payload(ds, keywords=["Krebs", "Genomik"], language="de") + r = self.one_authz_put(self._detail_url(ds, "de"), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("keywords", r.json()) + + def test_create_translation_array_same_size_ok(self): + # same number of keywords → 201 + ds = self._make_dataset(keywords=["cancer"]) + payload = self._payload(ds, keywords=["cancer FR"], language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_201_CREATED) + + def test_create_translation_array_shrinks_ok(self): + # fewer keywords than canonical → 201 + ds = self._make_dataset(keywords=["cancer", "genomics"]) + payload = self._payload(ds, keywords=["cancer FR"], language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_201_CREATED) + + def test_create_translation_no_arrays_ok(self): + # canonical has keywords, translation omits them entirely → 201 + ds = self._make_dataset(keywords=["cancer"]) + payload = self._payload(ds, language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_201_CREATED) From a3f939199611939cc22dfc310fa9779bd5eff476 Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Mon, 8 Jun 2026 05:42:05 -0400 Subject: [PATCH 02/15] style: fix E501 line too long in test_api_translations --- chord_metadata_service/chord/tests/test_api_translations.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/chord_metadata_service/chord/tests/test_api_translations.py b/chord_metadata_service/chord/tests/test_api_translations.py index 8bb56b343..abdb292db 100644 --- a/chord_metadata_service/chord/tests/test_api_translations.py +++ b/chord_metadata_service/chord/tests/test_api_translations.py @@ -218,7 +218,9 @@ def _payload(self, dataset: Dataset, primary_contact=None, **kwargs) -> dict: **kwargs, } - def _make_translation_in_db(self, dataset: Dataset, language: str, primary_contact=None, **kwargs) -> DatasetTranslation: + def _make_translation_in_db( + self, dataset: Dataset, language: str, primary_contact=None, **kwargs + ) -> DatasetTranslation: schema = KatsuDatasetModel( schema_version="1.0", title=f"Translation {language}", From 8e9e57cc7f177d0d8927540a8e01eb4b3e722155 Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Mon, 8 Jun 2026 06:09:20 -0400 Subject: [PATCH 03/15] feat(chord): infer identifier and project on translation from URL Clients no longer need to supply identifier or project in the translation request body; the serializer injects them from the dataset looked up via the URL identifier or the existing instance on update. --- chord_metadata_service/chord/api_views.py | 4 +-- chord_metadata_service/chord/serializers.py | 15 +++++++++++ .../chord/tests/test_api_translations.py | 25 +++++++------------ 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/chord_metadata_service/chord/api_views.py b/chord_metadata_service/chord/api_views.py index 39617f60a..113d58ea1 100644 --- a/chord_metadata_service/chord/api_views.py +++ b/chord_metadata_service/chord/api_views.py @@ -426,7 +426,7 @@ def _list(): )): return forbidden(request) authz.mark_authz_done(request) - serializer = DatasetTranslationSerializer(data=request.data) + serializer = DatasetTranslationSerializer(data=request.data, context=self.get_serializer_context()) def _create(): serializer.is_valid(raise_exception=True) @@ -460,7 +460,7 @@ async def translation_detail(self, request, language, **kwargs): if request.method == "DELETE": await translation.adelete() return Response(status=status.HTTP_204_NO_CONTENT) - serializer = DatasetTranslationSerializer(translation, data=request.data) + serializer = DatasetTranslationSerializer(translation, data=request.data, context=self.get_serializer_context()) def _update(): serializer.is_valid(raise_exception=True) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index c5ae87c3d..e9dbb34c2 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -188,7 +188,22 @@ class Meta: fields = "__all__" read_only_fields = ['created_at', 'updated_at', 'dataset'] + def _resolve_dataset_id(self) -> str | None: + if self.instance is not None: + return str(self.instance.dataset_id) + view = self.context.get("view") + if view is not None: + return view.kwargs.get("identifier") + return None + def to_internal_value(self, data): + dataset_id = self._resolve_dataset_id() + if dataset_id is not None: + try: + dataset = Dataset.objects.get(identifier=dataset_id) + data = {**data, "identifier": dataset_id, "project": str(dataset.project_id)} + except Dataset.DoesNotExist: + pass # view handles 404 result = super().to_internal_value(data) _check_translation_constraints(self._validated_schema) return result diff --git a/chord_metadata_service/chord/tests/test_api_translations.py b/chord_metadata_service/chord/tests/test_api_translations.py index abdb292db..7a8263069 100644 --- a/chord_metadata_service/chord/tests/test_api_translations.py +++ b/chord_metadata_service/chord/tests/test_api_translations.py @@ -28,8 +28,6 @@ def _translation_payload(self, title: str = "Test Translation") -> dict: "title": title, "description": "Test translation description", "primary_contact": VALID_DATASET_PRIMARY_CONTACT, - "identifier": str(self.dataset.identifier), - "project": str(self.project.identifier), } def _make_translation(self, language: str) -> DatasetTranslation: @@ -207,14 +205,12 @@ def _detail_url(self, dataset: Dataset, language: str) -> str: "language": language, }) - def _payload(self, dataset: Dataset, primary_contact=None, **kwargs) -> dict: + def _payload(self, primary_contact=None, **kwargs) -> dict: return { "schema_version": "1.0", "title": "Translated Title", "description": "Translated description", "primary_contact": primary_contact or VALID_DATASET_PRIMARY_CONTACT, - "identifier": str(dataset.identifier), - "project": str(self.project.identifier), **kwargs, } @@ -241,7 +237,7 @@ def test_create_translation_primary_contact_role_change_rejected(self): # canonical has ROLE_PI; translation sends empty roles → 400 contact_with_role = {"type": "person", "name": "Test Contact", "roles": [ROLE_PI]} ds = self._make_dataset(primary_contact=contact_with_role) - payload = self._payload(ds, language="fr") + payload = self._payload(language="fr") r = self.one_authz_post(self._list_url(ds), json=payload) self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) self.assertIn("primary_contact", r.json()) @@ -250,7 +246,6 @@ def test_create_translation_primary_contact_role_added_rejected(self): # canonical has empty roles; translation adds ROLE_PI → 400 ds = self._make_dataset() payload = self._payload( - ds, primary_contact={"type": "person", "name": "Test Contact", "roles": [ROLE_PI]}, language="fr", ) @@ -263,7 +258,6 @@ def test_create_translation_stakeholder_role_change_rejected(self): stakeholder = {"type": "person", "name": "Stakeholder", "roles": [ROLE_PI]} ds = self._make_dataset(stakeholders=[stakeholder]) payload = self._payload( - ds, stakeholders=[{"type": "person", "name": "Stakeholder FR", "roles": [ROLE_RESEARCHER]}], language="fr", ) @@ -276,7 +270,7 @@ def test_update_translation_primary_contact_role_change_rejected(self): contact_with_role = {"type": "person", "name": "Test Contact", "roles": [ROLE_PI]} ds = self._make_dataset(primary_contact=contact_with_role) self._make_translation_in_db(ds, "es", primary_contact=contact_with_role) - payload = self._payload(ds, language="es") # roles=[] differs from canonical ROLE_PI + payload = self._payload(language="es") # roles=[] differs from canonical ROLE_PI r = self.one_authz_put(self._detail_url(ds, "es"), json=payload) self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) self.assertIn("primary_contact", r.json()) @@ -285,7 +279,7 @@ def test_create_translation_roles_same_ok(self): # same roles → 201 contact_with_role = {"type": "person", "name": "Test Contact", "roles": [ROLE_PI]} ds = self._make_dataset(primary_contact=contact_with_role) - payload = self._payload(ds, primary_contact=contact_with_role, language="fr") + payload = self._payload(primary_contact=contact_with_role, language="fr") r = self.one_authz_post(self._list_url(ds), json=payload) self.assertEqual(r.status_code, status.HTTP_201_CREATED) @@ -294,7 +288,7 @@ def test_create_translation_roles_same_ok(self): def test_create_translation_keywords_grow_rejected(self): # canonical has 1 keyword; translation sends 2 → 400 ds = self._make_dataset(keywords=["cancer"]) - payload = self._payload(ds, keywords=["cancer", "genomics"], language="fr") + payload = self._payload(keywords=["cancer", "genomics"], language="fr") r = self.one_authz_post(self._list_url(ds), json=payload) self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) self.assertIn("keywords", r.json()) @@ -304,7 +298,6 @@ def test_create_translation_stakeholders_grow_rejected(self): stakeholder = {"type": "person", "name": "A", "roles": [ROLE_PI]} ds = self._make_dataset(stakeholders=[stakeholder]) payload = self._payload( - ds, stakeholders=[ {"type": "person", "name": "A FR", "roles": [ROLE_PI]}, {"type": "person", "name": "B FR", "roles": [ROLE_RESEARCHER]}, @@ -319,7 +312,7 @@ def test_update_translation_array_grows_rejected(self): # existing translation, PUT adds keyword → 400 ds = self._make_dataset(keywords=["cancer"]) self._make_translation_in_db(ds, "de", keywords=["Krebs"]) - payload = self._payload(ds, keywords=["Krebs", "Genomik"], language="de") + payload = self._payload(keywords=["Krebs", "Genomik"], language="de") r = self.one_authz_put(self._detail_url(ds, "de"), json=payload) self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) self.assertIn("keywords", r.json()) @@ -327,20 +320,20 @@ def test_update_translation_array_grows_rejected(self): def test_create_translation_array_same_size_ok(self): # same number of keywords → 201 ds = self._make_dataset(keywords=["cancer"]) - payload = self._payload(ds, keywords=["cancer FR"], language="fr") + payload = self._payload(keywords=["cancer FR"], language="fr") r = self.one_authz_post(self._list_url(ds), json=payload) self.assertEqual(r.status_code, status.HTTP_201_CREATED) def test_create_translation_array_shrinks_ok(self): # fewer keywords than canonical → 201 ds = self._make_dataset(keywords=["cancer", "genomics"]) - payload = self._payload(ds, keywords=["cancer FR"], language="fr") + payload = self._payload(keywords=["cancer FR"], language="fr") r = self.one_authz_post(self._list_url(ds), json=payload) self.assertEqual(r.status_code, status.HTTP_201_CREATED) def test_create_translation_no_arrays_ok(self): # canonical has keywords, translation omits them entirely → 201 ds = self._make_dataset(keywords=["cancer"]) - payload = self._payload(ds, language="fr") + payload = self._payload(language="fr") r = self.one_authz_post(self._list_url(ds), json=payload) self.assertEqual(r.status_code, status.HTTP_201_CREATED) From 396f224d9732eb7f590ed81af3bb7395ab9df05e Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Tue, 9 Jun 2026 11:15:54 -0400 Subject: [PATCH 04/15] feat(chord): expose available translations in dataset serializer response --- chord_metadata_service/chord/api_views.py | 1 + chord_metadata_service/chord/serializers.py | 1 + 2 files changed, 2 insertions(+) diff --git a/chord_metadata_service/chord/api_views.py b/chord_metadata_service/chord/api_views.py index 113d58ea1..6ea79b28f 100644 --- a/chord_metadata_service/chord/api_views.py +++ b/chord_metadata_service/chord/api_views.py @@ -161,6 +161,7 @@ def get_queryset(self): project_id = self.request.query_params.get("project_id") if project_id: queryset = queryset.filter(project_id=project_id) + queryset = queryset.prefetch_related("translations") language = _get_preferred_language(self.request) if language != "en": queryset = queryset.prefetch_related( diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index e9dbb34c2..25e997d68 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -89,6 +89,7 @@ def to_representation(self, instance): data['created_at'] = instance.created_at data['updated_at'] = instance.updated_at data['counts_by_entity'] = self.get_counts_by_entity(instance) + data['translations'] = [t.language for t in instance.translations.all()] return data def get_counts_by_entity(self, obj): From 32ecd855e5694cbfdcca0c40960adc7dc0a4240a Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Tue, 9 Jun 2026 15:21:53 -0400 Subject: [PATCH 05/15] feat(chord): extend translation constraints to all shared fields Previously only list fields were checked. Non-list optional fields (e.g. spatial_coverage, program_name) could be silently nulled out in translations. Now any field present in canonical must also be present in the translation; list fields must additionally match length. --- chord_metadata_service/chord/serializers.py | 38 ++++++++++--------- .../chord/tests/test_api_translations.py | 12 +++--- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index 25e997d68..765c4112d 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -135,7 +135,8 @@ def _check_translation_constraints(translation: ProjectScopedDatasetModel): """ Validate two invariants that translations must respect vs. the canonical (English) dataset: 1. roles on primary_contact and each stakeholder cannot change. - 2. no list field may gain items (translations can omit or keep, not add). + 2. no field present in canonical may be removed (set to None) in a translation; + for list fields, lengths must also match exactly. """ try: dataset = Dataset.objects.get(identifier=str(translation.identifier)) @@ -159,23 +160,24 @@ def _check_translation_constraints(translation: ProjectScopedDatasetModel): f"Stakeholder at index {i}: roles cannot change in a translation." ) - # Rule 2: arrays cannot grow - _LIST_FIELDS = ( - "taxa", "keywords", "resources", "stakeholders", "counts", - "links", "publications", "logos", "participant_criteria", "domain", - ) - for field in _LIST_FIELDS: - c_val = getattr(canonical, field, None) or [] - t_val = getattr(translation, field, None) or [] - if isinstance(c_val, list) and isinstance(t_val, list) and len(t_val) > len(c_val): - errors[field] = [f"Translation cannot add items to '{field}' (canonical has {len(c_val)})."] - - c_fs = canonical.funding_sources - t_fs = translation.funding_sources - if isinstance(c_fs, list) and isinstance(t_fs, list) and len(t_fs) > len(c_fs): - errors["funding_sources"] = [ - f"Translation cannot add items to 'funding_sources' (canonical has {len(c_fs)})." - ] + # Rule 2: translations cannot remove or add data to any shared field + shared_fields = canonical.model_fields.keys() & translation.model_fields.keys() + for field in shared_fields: + c_val = getattr(canonical, field, None) + t_val = getattr(translation, field, None) + + if c_val is None or (isinstance(c_val, list) and len(c_val) == 0): + continue # canonical has nothing here; translation free to omit too + + if t_val is None: + errors[field] = [f"Translation cannot remove '{field}' (present in canonical)."] + elif isinstance(c_val, list): + t_len = len(t_val) if isinstance(t_val, list) else 0 + if t_len != len(c_val): + errors[field] = [ + f"Translation must have the same number of items in '{field}' as canonical " + f"(canonical has {len(c_val)}, translation has {t_len})." + ] if errors: raise serializers.ValidationError(errors) diff --git a/chord_metadata_service/chord/tests/test_api_translations.py b/chord_metadata_service/chord/tests/test_api_translations.py index 7a8263069..a47a2e96a 100644 --- a/chord_metadata_service/chord/tests/test_api_translations.py +++ b/chord_metadata_service/chord/tests/test_api_translations.py @@ -324,16 +324,16 @@ def test_create_translation_array_same_size_ok(self): r = self.one_authz_post(self._list_url(ds), json=payload) self.assertEqual(r.status_code, status.HTTP_201_CREATED) - def test_create_translation_array_shrinks_ok(self): - # fewer keywords than canonical → 201 + def test_create_translation_array_shrinks_rejected(self): + # fewer keywords than canonical → 400 ds = self._make_dataset(keywords=["cancer", "genomics"]) payload = self._payload(keywords=["cancer FR"], language="fr") r = self.one_authz_post(self._list_url(ds), json=payload) - self.assertEqual(r.status_code, status.HTTP_201_CREATED) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) - def test_create_translation_no_arrays_ok(self): - # canonical has keywords, translation omits them entirely → 201 + def test_create_translation_omits_arrays_rejected(self): + # canonical has keywords, translation omits them entirely → 400 ds = self._make_dataset(keywords=["cancer"]) payload = self._payload(language="fr") r = self.one_authz_post(self._list_url(ds), json=payload) - self.assertEqual(r.status_code, status.HTTP_201_CREATED) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) From 7f9f2b5c2c286629b82658ee3fafe05cba4d7c54 Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Wed, 10 Jun 2026 13:25:04 -0400 Subject: [PATCH 06/15] feat(chord): reject discovery config and immutable fields in translations Translations cannot include a discovery configuration (rejected before Pydantic parsing, since extra="allow" would silently accept it). Non-translatable fields (version, release_date, last_modified, study_status, study_context) must exactly match the canonical dataset value; providing a different value or adding one where canonical has none is rejected. --- chord_metadata_service/chord/serializers.py | 19 ++++- .../chord/tests/test_api_translations.py | 74 +++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index 765c4112d..d767ba175 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -131,12 +131,17 @@ def _roles_for(contact) -> list: return getattr(contact, "roles", []) or [] +_IMMUTABLE_FIELDS = frozenset({"version", "release_date", "last_modified", "study_status", "study_context"}) + + def _check_translation_constraints(translation: ProjectScopedDatasetModel): """ - Validate two invariants that translations must respect vs. the canonical (English) dataset: + Validate three invariants that translations must respect vs. the canonical (English) dataset: 1. roles on primary_contact and each stakeholder cannot change. 2. no field present in canonical may be removed (set to None) in a translation; for list fields, lengths must also match exactly. + 3. non-translatable fields (version, release_date, last_modified, study_status, + study_context) must equal the canonical value exactly. """ try: dataset = Dataset.objects.get(identifier=str(translation.identifier)) @@ -179,6 +184,16 @@ def _check_translation_constraints(translation: ProjectScopedDatasetModel): f"(canonical has {len(c_val)}, translation has {t_len})." ] + # Rule 3: non-translatable fields must be identical to canonical + for field in _IMMUTABLE_FIELDS: + c_val = getattr(canonical, field, None) + t_val = getattr(translation, field, None) + if c_val != t_val: + errors[field] = [ + f"'{field}' cannot change in a translation " + f"(expected {c_val!r}, got {t_val!r})." + ] + if errors: raise serializers.ValidationError(errors) @@ -200,6 +215,8 @@ def _resolve_dataset_id(self) -> str | None: return None def to_internal_value(self, data): + if "discovery" in data: + raise serializers.ValidationError({"discovery": ["Translations cannot include a discovery configuration."]}) dataset_id = self._resolve_dataset_id() if dataset_id is not None: try: diff --git a/chord_metadata_service/chord/tests/test_api_translations.py b/chord_metadata_service/chord/tests/test_api_translations.py index a47a2e96a..5e7799fca 100644 --- a/chord_metadata_service/chord/tests/test_api_translations.py +++ b/chord_metadata_service/chord/tests/test_api_translations.py @@ -337,3 +337,77 @@ def test_create_translation_omits_arrays_rejected(self): payload = self._payload(language="fr") r = self.one_authz_post(self._list_url(ds), json=payload) self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + + def test_create_translation_with_discovery_rejected(self): + # translations cannot include a discovery config → 400 + ds = self._make_dataset() + payload = self._payload(language="fr", discovery={"rules": {}}) + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("discovery", r.json()) + + def test_update_translation_with_discovery_rejected(self): + # PUT with discovery config on existing translation → 400 + ds = self._make_dataset() + self._make_translation_in_db(ds, "fr") + payload = self._payload(language="fr", discovery={"rules": {}}) + r = self.one_authz_put(self._detail_url(ds, "fr"), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("discovery", r.json()) + + # ---- Rule 3: non-translatable fields must match canonical ---- + + def test_create_translation_version_change_rejected(self): + # canonical version "1.0", translation sends "2.0" → 400 + ds = self._make_dataset(version="1.0") + payload = self._payload(version="2.0", language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("version", r.json()) + + def test_create_translation_version_same_ok(self): + # same version → 201 + ds = self._make_dataset(version="1.0") + payload = self._payload(version="1.0", language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_201_CREATED) + + def test_create_translation_study_status_change_rejected(self): + # canonical study_status "ONGOING", translation sends "COMPLETED" → 400 + ds = self._make_dataset(study_status="ONGOING") + payload = self._payload(study_status="COMPLETED", language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("study_status", r.json()) + + def test_create_translation_study_context_change_rejected(self): + # canonical study_context "CLINICAL", translation sends "RESEARCH" → 400 + ds = self._make_dataset(study_context="CLINICAL") + payload = self._payload(study_context="RESEARCH", language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("study_context", r.json()) + + def test_create_translation_release_date_change_rejected(self): + # canonical release_date "2024-01-01", translation sends different date → 400 + ds = self._make_dataset(release_date="2024-01-01") + payload = self._payload(release_date="2025-06-01", language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("release_date", r.json()) + + def test_create_translation_last_modified_change_rejected(self): + # canonical last_modified "2024-01-01", translation sends different date → 400 + ds = self._make_dataset(last_modified="2024-01-01") + payload = self._payload(last_modified="2025-06-01", language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("last_modified", r.json()) + + def test_create_translation_adds_immutable_field_rejected(self): + # canonical has no study_status; translation provides one → 400 + ds = self._make_dataset() + payload = self._payload(study_status="ONGOING", language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) + self.assertIn("study_status", r.json()) From a00bfac23cda69c71c2363f9fa3ca80c951a544a Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Wed, 10 Jun 2026 14:04:24 -0400 Subject: [PATCH 07/15] feat(chord): allow omitting immutable fields in translations Immutable fields (version, release_date, last_modified, study_status, study_context) are now exempt from Rule 2 (removal check). Rule 3 only fires when the field is explicitly provided and differs from canonical. --- chord_metadata_service/chord/serializers.py | 8 ++++++-- .../chord/tests/test_api_translations.py | 7 +++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index d767ba175..d84bd53e8 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -166,8 +166,12 @@ def _check_translation_constraints(translation: ProjectScopedDatasetModel): ) # Rule 2: translations cannot remove or add data to any shared field + # (immutable fields are exempt — omitting them is always allowed) shared_fields = canonical.model_fields.keys() & translation.model_fields.keys() for field in shared_fields: + if field in _IMMUTABLE_FIELDS: + continue + c_val = getattr(canonical, field, None) t_val = getattr(translation, field, None) @@ -184,11 +188,11 @@ def _check_translation_constraints(translation: ProjectScopedDatasetModel): f"(canonical has {len(c_val)}, translation has {t_len})." ] - # Rule 3: non-translatable fields must be identical to canonical + # Rule 3: if an immutable field is present in the translation it must match canonical exactly for field in _IMMUTABLE_FIELDS: c_val = getattr(canonical, field, None) t_val = getattr(translation, field, None) - if c_val != t_val: + if t_val is not None and t_val != c_val: errors[field] = [ f"'{field}' cannot change in a translation " f"(expected {c_val!r}, got {t_val!r})." diff --git a/chord_metadata_service/chord/tests/test_api_translations.py b/chord_metadata_service/chord/tests/test_api_translations.py index 5e7799fca..bbf11ebcf 100644 --- a/chord_metadata_service/chord/tests/test_api_translations.py +++ b/chord_metadata_service/chord/tests/test_api_translations.py @@ -411,3 +411,10 @@ def test_create_translation_adds_immutable_field_rejected(self): r = self.one_authz_post(self._list_url(ds), json=payload) self.assertEqual(r.status_code, status.HTTP_400_BAD_REQUEST) self.assertIn("study_status", r.json()) + + def test_create_translation_omits_immutable_field_ok(self): + # canonical has version; translation omits it entirely → 201 + ds = self._make_dataset(version="1.0") + payload = self._payload(language="fr") + r = self.one_authz_post(self._list_url(ds), json=payload) + self.assertEqual(r.status_code, status.HTTP_201_CREATED) From 8de4a1fae6b01f39970375bd310e2f3dd0260871 Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Wed, 10 Jun 2026 15:00:37 -0400 Subject: [PATCH 08/15] feat(chord): extend translation constraints to discovery and dac_id Add discovery and dac_id to _IMMUTABLE_FIELDS so Rule 2 does not require them to be present in a translation. Rule 3 still rejects them if explicitly provided with a value that differs from canonical. --- chord_metadata_service/chord/serializers.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index d84bd53e8..e354b2e5c 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -131,7 +131,8 @@ def _roles_for(contact) -> list: return getattr(contact, "roles", []) or [] -_IMMUTABLE_FIELDS = frozenset({"version", "release_date", "last_modified", "study_status", "study_context"}) +_IMMUTABLE_FIELDS = frozenset({"version", "release_date", "last_modified", "study_status", "study_context", + "discovery", "dac_id"}) def _check_translation_constraints(translation: ProjectScopedDatasetModel): @@ -141,7 +142,8 @@ def _check_translation_constraints(translation: ProjectScopedDatasetModel): 2. no field present in canonical may be removed (set to None) in a translation; for list fields, lengths must also match exactly. 3. non-translatable fields (version, release_date, last_modified, study_status, - study_context) must equal the canonical value exactly. + study_context, discovery, dac_id) must equal the canonical value exactly if + provided; omitting them is allowed. """ try: dataset = Dataset.objects.get(identifier=str(translation.identifier)) From d7cb3a06a2da761d304dd87e3776749c4f4774ae Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Wed, 10 Jun 2026 15:44:44 -0400 Subject: [PATCH 09/15] fix(chord): update dac_id field name to pcgl_dac_id in immutable constraints --- chord_metadata_service/chord/serializers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index e354b2e5c..521d8a7cc 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -132,7 +132,7 @@ def _roles_for(contact) -> list: _IMMUTABLE_FIELDS = frozenset({"version", "release_date", "last_modified", "study_status", "study_context", - "discovery", "dac_id"}) + "discovery", "pcgl_dac_id"}) def _check_translation_constraints(translation: ProjectScopedDatasetModel): From 402fe684dee78f3e37e2c68c19e70e74d2403aed Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Mon, 15 Jun 2026 11:50:59 -0400 Subject: [PATCH 10/15] docs(chord): add docstring to _roles_for helper --- chord_metadata_service/chord/serializers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index 521d8a7cc..9813093cf 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -128,6 +128,7 @@ def update(self, instance, validated_data): def _roles_for(contact) -> list: + """Return roles list for a contact, defaulting to empty list.""" return getattr(contact, "roles", []) or [] From 72576ba72d77fd444cd47cae161e96931119f388 Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Mon, 15 Jun 2026 13:21:29 -0400 Subject: [PATCH 11/15] fix(chord): discovery translation was forced to be not included, immutable check not required. --- chord_metadata_service/chord/serializers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index 9813093cf..9a9c39278 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -133,7 +133,7 @@ def _roles_for(contact) -> list: _IMMUTABLE_FIELDS = frozenset({"version", "release_date", "last_modified", "study_status", "study_context", - "discovery", "pcgl_dac_id"}) + "pcgl_dac_id"}) def _check_translation_constraints(translation: ProjectScopedDatasetModel): From 75e38802617567d252e4e76ba7442f0ee9c746e9 Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Tue, 16 Jun 2026 01:57:35 -0400 Subject: [PATCH 12/15] chore: trigger CI --- chord_metadata_service/chord/serializers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index 9a9c39278..076e68097 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -1,4 +1,5 @@ import uuid +# trigger CI from pydantic import ValidationError as PydValidationError from bento_lib.discovery import DiscoveryConfig from bento_lib.provenance.dataset import ProjectScopedDatasetModel From df310228fc18b47b63137286f8bf91d2fe766595 Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Tue, 16 Jun 2026 02:05:30 -0400 Subject: [PATCH 13/15] Revert "chore: trigger CI" This reverts commit 75e38802617567d252e4e76ba7442f0ee9c746e9. --- chord_metadata_service/chord/serializers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index 076e68097..9a9c39278 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -1,5 +1,4 @@ import uuid -# trigger CI from pydantic import ValidationError as PydValidationError from bento_lib.discovery import DiscoveryConfig from bento_lib.provenance.dataset import ProjectScopedDatasetModel From f57f90eb121df8263e5f60e9d214eb14915d60dc Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Tue, 23 Jun 2026 11:00:10 -0400 Subject: [PATCH 14/15] fix(dataset): add discovery to immutable fields to allow omission in translations discovery was missing from _IMMUTABLE_FIELDS, causing Rule 2 to reject translations that omitted it even though it is non-translatable content. --- chord_metadata_service/chord/serializers.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index 9a9c39278..4b0f4493a 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -132,8 +132,10 @@ def _roles_for(contact) -> list: return getattr(contact, "roles", []) or [] +# 'discovery' is also explicitly blocked in DatasetTranslationSerializer.to_internal_value, +# but must be immutable here too so Rule 2 doesn't fire when it's omitted. _IMMUTABLE_FIELDS = frozenset({"version", "release_date", "last_modified", "study_status", "study_context", - "pcgl_dac_id"}) + "discovery", "pcgl_dac_id"}) def _check_translation_constraints(translation: ProjectScopedDatasetModel): From d8740522d93bf6355363829d286fee8ee0a9f4e6 Mon Sep 17 00:00:00 2001 From: Sanjeev Lakhwani Date: Tue, 23 Jun 2026 13:49:11 -0400 Subject: [PATCH 15/15] fix(dataset): block translations from introducing fields absent in canonical --- chord_metadata_service/chord/serializers.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/chord_metadata_service/chord/serializers.py b/chord_metadata_service/chord/serializers.py index 4b0f4493a..57064648c 100644 --- a/chord_metadata_service/chord/serializers.py +++ b/chord_metadata_service/chord/serializers.py @@ -142,8 +142,8 @@ def _check_translation_constraints(translation: ProjectScopedDatasetModel): """ Validate three invariants that translations must respect vs. the canonical (English) dataset: 1. roles on primary_contact and each stakeholder cannot change. - 2. no field present in canonical may be removed (set to None) in a translation; - for list fields, lengths must also match exactly. + 2. no field present in canonical may be removed (set to None) in a translation, and + no field absent in canonical may be introduced; for list fields, lengths must also match exactly. 3. non-translatable fields (version, release_date, last_modified, study_status, study_context, discovery, dac_id) must equal the canonical value exactly if provided; omitting them is allowed. @@ -180,8 +180,12 @@ def _check_translation_constraints(translation: ProjectScopedDatasetModel): c_val = getattr(canonical, field, None) t_val = getattr(translation, field, None) - if c_val is None or (isinstance(c_val, list) and len(c_val) == 0): - continue # canonical has nothing here; translation free to omit too + c_empty = c_val is None or (isinstance(c_val, list) and len(c_val) == 0) + t_empty = t_val is None or (isinstance(t_val, list) and len(t_val) == 0) + if c_empty: + if not t_empty: + errors[field] = [f"Translation cannot introduce '{field}' (not present in canonical)."] + continue if t_val is None: errors[field] = [f"Translation cannot remove '{field}' (present in canonical)."]