From 8d7083a6692ce838c30e3e535ca3484789ac1e77 Mon Sep 17 00:00:00 2001 From: Richard Tibbles Date: Mon, 22 Jun 2026 21:43:20 -0700 Subject: [PATCH 1/3] feat(migrations): add zero-downtime migration capability - Safe-DDL backend: lock timeout, fail-fast, runtime unsafe-op guard - CI linting of new migrations on pull requests - Declarative dual-write trigger decorator (mirror_field) - Reusable batched-backfill command (idempotent, resumable, throttled) Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LfZvkigk8hdsKdEif3hzBi --- .github/workflows/pythontest.yml | 13 ++ .../contentcuration/db/backends/__init__.py | 0 .../zero_downtime_prometheus/__init__.py | 0 .../backends/zero_downtime_prometheus/base.py | 12 ++ .../contentcuration/db/dual_write.py | 31 +++ .../management/commands/backfill_column.py | 100 +++++++++ .../not_production_settings.py | 9 + .../contentcuration/production_settings.py | 2 +- contentcuration/contentcuration/settings.py | 10 +- .../contentcuration/test_settings.py | 3 + .../tests/test_backfill_column.py | 191 ++++++++++++++++++ .../contentcuration/tests/test_dual_write.py | 51 +++++ requirements-dev.in | 1 + requirements-dev.txt | 9 +- requirements.in | 2 + requirements.txt | 8 +- 16 files changed, 438 insertions(+), 4 deletions(-) create mode 100644 contentcuration/contentcuration/db/backends/__init__.py create mode 100644 contentcuration/contentcuration/db/backends/zero_downtime_prometheus/__init__.py create mode 100644 contentcuration/contentcuration/db/backends/zero_downtime_prometheus/base.py create mode 100644 contentcuration/contentcuration/db/dual_write.py create mode 100644 contentcuration/contentcuration/management/commands/backfill_column.py create mode 100644 contentcuration/contentcuration/tests/test_backfill_column.py create mode 100644 contentcuration/contentcuration/tests/test_dual_write.py diff --git a/.github/workflows/pythontest.yml b/.github/workflows/pythontest.yml index e77c613e69..609a0a933d 100644 --- a/.github/workflows/pythontest.yml +++ b/.github/workflows/pythontest.yml @@ -62,6 +62,8 @@ jobs: - 6379:6379 steps: - uses: actions/checkout@v6 + with: + fetch-depth: 0 - name: Set up minio run: | docker run -d -p 9000:9000 --name minio \ @@ -79,6 +81,17 @@ jobs: run: | # Use uv to install dependencies directly from requirements files uv pip sync requirements.txt requirements-dev.txt + - name: Lint new migrations for unsafe operations + if: github.event_name == 'pull_request' + env: + BASE_REF: ${{ github.base_ref }} + DJANGO_SETTINGS_MODULE: contentcuration.not_production_settings + run: | + set -euo pipefail + git fetch --no-tags origin "$BASE_REF" + base="$(git merge-base "origin/$BASE_REF" HEAD)" + test -n "$base" + python contentcuration/manage.py lintmigrations --git-commit-id "$base" --no-cache --warnings-as-errors - name: Test pytest run: | sh -c './contentcuration/manage.py makemigrations --check' diff --git a/contentcuration/contentcuration/db/backends/__init__.py b/contentcuration/contentcuration/db/backends/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/contentcuration/contentcuration/db/backends/zero_downtime_prometheus/__init__.py b/contentcuration/contentcuration/db/backends/zero_downtime_prometheus/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/contentcuration/contentcuration/db/backends/zero_downtime_prometheus/base.py b/contentcuration/contentcuration/db/backends/zero_downtime_prometheus/base.py new file mode 100644 index 0000000000..ad23a39030 --- /dev/null +++ b/contentcuration/contentcuration/db/backends/zero_downtime_prometheus/base.py @@ -0,0 +1,12 @@ +from django_prometheus.db.backends.postgresql.base import ( + DatabaseWrapper as PrometheusDatabaseWrapper, +) +from django_zero_downtime_migrations.backends.postgres.schema import ( + DatabaseSchemaEditor, +) + + +class DatabaseWrapper(PrometheusDatabaseWrapper): + """Prometheus query metrics + zero-downtime safe-DDL schema editor.""" + + SchemaEditorClass = DatabaseSchemaEditor diff --git a/contentcuration/contentcuration/db/dual_write.py b/contentcuration/contentcuration/db/dual_write.py new file mode 100644 index 0000000000..604875670e --- /dev/null +++ b/contentcuration/contentcuration/db/dual_write.py @@ -0,0 +1,31 @@ +import hashlib + +import pgtrigger + + +def mirror_field(source, target): + """Mirror Django field `source` into `target` via a BEFORE INSERT/UPDATE + trigger (expand/contract dual-write).""" + + def decorator(model): + source_col = model._meta.get_field(source).column + target_col = model._meta.get_field(target).column + name = "mirror_{}_to_{}".format(source_col, target_col) + if len(name) > 43: # stay safely under pgtrigger's trigger-name limit + digest = hashlib.sha1( + "{}_{}".format(source_col, target_col).encode() + ).hexdigest()[:8] + name = "mirror_{}".format(digest) + # Change-guard (IS DISTINCT FROM): keeps a read cutover from clobbering + # writes to the repointed column with the stale source value. + trigger = pgtrigger.Trigger( + name=name, + when=pgtrigger.Before, + operation=pgtrigger.Insert | pgtrigger.Update, + func="IF NEW.{s} IS DISTINCT FROM OLD.{s} THEN NEW.{t} = NEW.{s}; END IF; RETURN NEW;".format( + s=source_col, t=target_col + ), + ) + return pgtrigger.register(trigger)(model) + + return decorator diff --git a/contentcuration/contentcuration/management/commands/backfill_column.py b/contentcuration/contentcuration/management/commands/backfill_column.py new file mode 100644 index 0000000000..1920c47af1 --- /dev/null +++ b/contentcuration/contentcuration/management/commands/backfill_column.py @@ -0,0 +1,100 @@ +from django.apps import apps +from django.core.exceptions import FieldDoesNotExist +from django.core.management.base import BaseCommand +from django.core.management.base import CommandError +from django.db import transaction +from django.db.models import F + + +class Command(BaseCommand): + help = ( + "Idempotent, resumable online backfill of one column into another, in batches." + ) + + def add_arguments(self, parser): + parser.add_argument("--model", required=True, help="app_label.ModelName") + parser.add_argument("--source-field", required=True) + parser.add_argument("--target-field", required=True) + parser.add_argument("--batch-size", type=int, default=10000) + parser.add_argument("--start-id", default=None, help="resume from this pk") + parser.add_argument( + "--progress-check", + action="store_true", + help="report unbackfilled rows, exit nonzero if any", + ) + + def _resolve_model_fields(self, model_label, source, target): + try: + model = apps.get_model(model_label) + except (LookupError, ValueError) as e: + raise CommandError("Bad --model {!r}: {}".format(model_label, e)) + try: + model._meta.get_field(source) + model._meta.get_field(target) + except FieldDoesNotExist as e: + raise CommandError(str(e)) + return model + + def _batch_end_pk(self, queryset, pk_name, start_pk, batch_size): + """Last pk of the batch of `batch_size` rows starting at `start_pk`. + + Returns None when fewer than `batch_size` rows remain at/after + `start_pk` — the final, short batch. Keyset paging by pk, so it works + for any pk type (int or UUID). + """ + return ( + queryset.filter(pk__gte=start_pk) + .order_by(pk_name) + .values_list("pk", flat=True)[batch_size - 1 : batch_size] + .first() + ) + + def handle(self, *args, **options): + if options["batch_size"] < 1: + raise CommandError("--batch-size must be >= 1") + source = options["source_field"] + target = options["target_field"] + model = self._resolve_model_fields(options["model"], source, target) + + pk_name = model._meta.pk.name + batch_size = options["batch_size"] + only_unfilled = {target + "__isnull": True, source + "__isnull": False} + unfilled = model.objects.filter(**only_unfilled) + unfilled_pks = unfilled.order_by(pk_name).values_list("pk", flat=True) + + if options["progress_check"]: + # exists(), not count() — the target table can have millions of rows. + if unfilled.exists(): + raise CommandError("backfill incomplete: rows still pending") + self.stdout.write("Backfill complete: no rows pending.") + return + + # Start at the first unfilled pk (>= --start-id if given); re-runs and + # resumes skip straight past an already-filled prefix. + batch_start = unfilled_pks + if options["start_id"] is not None: + batch_start = batch_start.filter(pk__gte=options["start_id"]) + batch_start = batch_start.first() + + total = 0 + while batch_start is not None: + batch_end = self._batch_end_pk( + model.objects, pk_name, batch_start, batch_size + ) + if batch_end is None: + window = {"pk__gte": batch_start} + else: + window = {"pk__gte": batch_start, "pk__lte": batch_end} + with transaction.atomic(): + total += model.objects.filter(**window, **only_unfilled).update( + **{target: F(source)} + ) + self.stdout.write( + "backfilled through pk={} (updated {} so far)".format( + batch_start if batch_end is None else batch_end, total + ) + ) + if batch_end is None: + break + batch_start = unfilled_pks.filter(pk__gt=batch_end).first() + self.stdout.write("Done. {} rows updated.".format(total)) diff --git a/contentcuration/contentcuration/not_production_settings.py b/contentcuration/contentcuration/not_production_settings.py index afcc6460bc..35be6db3c2 100644 --- a/contentcuration/contentcuration/not_production_settings.py +++ b/contentcuration/contentcuration/not_production_settings.py @@ -20,5 +20,14 @@ AWS_AUTO_CREATE_BUCKET = True +INSTALLED_APPS += ("django_migration_linter",) # noqa F405 + +MIGRATION_LINTER_OPTIONS = { + "exclude_apps": [ + "kolibri_content" + ], # SQLite content-export app; not on the safe-DDL Postgres backend + "sql_analyser": "postgresql", +} + # Use local instance for curriculum automation for development CURRICULUM_AUTOMATION_API_URL = "http://localhost:8000" diff --git a/contentcuration/contentcuration/production_settings.py b/contentcuration/contentcuration/production_settings.py index a00bf43a41..92d63cff73 100644 --- a/contentcuration/contentcuration/production_settings.py +++ b/contentcuration/contentcuration/production_settings.py @@ -39,7 +39,7 @@ if SITE_READ_ONLY: CACHES["default"]["BACKEND"] = "django_prometheus.cache.backends.locmem.LocMemCache" -DATABASES["default"]["ENGINE"] = "django_prometheus.db.backends.postgresql" +DATABASES["default"]["ENGINE"] = "contentcuration.db.backends.zero_downtime_prometheus" REST_FRAMEWORK["DEFAULT_RENDERER_CLASSES"] = [ diff --git a/contentcuration/contentcuration/settings.py b/contentcuration/contentcuration/settings.py index 2d8bafaa9b..e9d9c3e4fc 100644 --- a/contentcuration/contentcuration/settings.py +++ b/contentcuration/contentcuration/settings.py @@ -92,6 +92,7 @@ "django_celery_results", "kolibri_public", "automation", + "pgtrigger", ) SESSION_ENGINE = "django.contrib.sessions.backends.cached_db" @@ -193,7 +194,7 @@ DATABASES = { "default": { - "ENGINE": "django.db.backends.postgresql_psycopg2", + "ENGINE": "django_zero_downtime_migrations.backends.postgres", "NAME": os.getenv("DATA_DB_NAME") or "kolibri-studio", # For dev purposes only "USER": os.getenv("DATA_DB_USER") or "learningequality", @@ -204,6 +205,13 @@ }, } +ZERO_DOWNTIME_MIGRATIONS_LOCK_TIMEOUT = "30s" +ZERO_DOWNTIME_MIGRATIONS_STATEMENT_TIMEOUT = "60s" +# don't kill long-but-safe ops (e.g. CREATE INDEX CONCURRENTLY) +ZERO_DOWNTIME_MIGRATIONS_FLEXIBLE_STATEMENT_TIMEOUT = True +# surface unsafe DDL at runtime +ZERO_DOWNTIME_MIGRATIONS_RAISE_FOR_UNSAFE = True + IS_CONTENTNODE_TABLE_PARTITIONED = ( os.getenv("IS_CONTENTNODE_TABLE_PARTITIONED") or False ) diff --git a/contentcuration/contentcuration/test_settings.py b/contentcuration/contentcuration/test_settings.py index a1fcf20e6b..acc80aabd9 100644 --- a/contentcuration/contentcuration/test_settings.py +++ b/contentcuration/contentcuration/test_settings.py @@ -2,6 +2,9 @@ DEBUG = True +# Test DB build replays legacy pre-safe-DDL migrations; runtime and the CI linter still enforce. +ZERO_DOWNTIME_MIGRATIONS_RAISE_FOR_UNSAFE = False + WEBPACK_LOADER["DEFAULT"][ # noqa "LOADER_CLASS" ] = "contentcuration.tests.webpack_loader.TestWebpackLoader" diff --git a/contentcuration/contentcuration/tests/test_backfill_column.py b/contentcuration/contentcuration/tests/test_backfill_column.py new file mode 100644 index 0000000000..467b967f0f --- /dev/null +++ b/contentcuration/contentcuration/tests/test_backfill_column.py @@ -0,0 +1,191 @@ +import uuid +from io import StringIO +from unittest.mock import patch + +from django.core.management import call_command +from django.core.management import CommandError +from django.db import connection +from django.db import models +from django.db.models import F +from django.test import SimpleTestCase +from django.test import TransactionTestCase +from django.test.utils import isolate_apps + + +def _make_probe_class(): + class Probe(models.Model): + source = models.IntegerField(null=True) + shadow = models.IntegerField(null=True) + + class Meta: + app_label = "contentcuration" + + return Probe + + +def _make_uuid_probe_class(): + class UUIDProbe(models.Model): + id = models.UUIDField(primary_key=True, default=uuid.uuid4) + source = models.IntegerField(null=True) + shadow = models.IntegerField(null=True) + + class Meta: + app_label = "contentcuration" + + return UUIDProbe + + +@isolate_apps("contentcuration") +class BackfillColumnTestCase(TransactionTestCase): + def _create_model(self, model): + with connection.schema_editor(atomic=False) as editor: + editor.create_model(model) + self.addCleanup(self._delete_model, model) + + def _delete_model(self, model): + with connection.schema_editor(atomic=False) as editor: + editor.delete_model(model) + + def _run_backfill(self, model, **kwargs): + out = StringIO() + with patch( + "contentcuration.management.commands.backfill_column.apps", + model._meta.apps, + ): + call_command( + "backfill_column", + stdout=out, + model="contentcuration.{}".format(model.__name__), + source_field="source", + target_field="shadow", + **kwargs, + ) + return out.getvalue() + + def _assert_all_synced(self, model): + self.assertEqual( + model.objects.count(), model.objects.filter(shadow=F("source")).count() + ) + + def _assert_synced_from(self, model, resume_pk): + below = model.objects.filter(pk__lt=resume_pk) + at_or_above = model.objects.filter(pk__gte=resume_pk) + self.assertEqual(below.count(), below.filter(shadow__isnull=True).count()) + self.assertEqual( + at_or_above.count(), at_or_above.filter(shadow=F("source")).count() + ) + + def test_backfills_all_rows(self): + Probe = _make_probe_class() + self._create_model(Probe) + for i in range(1, 6): + Probe.objects.create(source=i * 10, shadow=None) + + # batch_size=2 over 5 rows exercises the multi-batch loop. + self._run_backfill(Probe, batch_size=2) + + self._assert_all_synced(Probe) + + def test_idempotent(self): + Probe = _make_probe_class() + self._create_model(Probe) + for i in range(1, 4): + Probe.objects.create(source=i * 10, shadow=None) + + self._run_backfill(Probe, batch_size=10) + output = self._run_backfill(Probe, batch_size=10) + + self.assertIn("Done. 0 rows updated.", output) + self._assert_all_synced(Probe) + + def test_resumable(self): + Probe = _make_probe_class() + self._create_model(Probe) + objs = sorted( + [Probe.objects.create(source=i * 10, shadow=None) for i in range(1, 6)], + key=lambda o: o.pk, + ) + resume_pk = objs[2].pk + + self._run_backfill(Probe, batch_size=10, start_id=resume_pk) + + self._assert_synced_from(Probe, resume_pk) + + def test_null_source_safe(self): + Probe = _make_probe_class() + self._create_model(Probe) + Probe.objects.create(source=None, shadow=None) + Probe.objects.create(source=42, shadow=None) + + output = self._run_backfill(Probe, batch_size=10) + + self.assertIn("Done.", output) + self.assertIsNone(Probe.objects.get(source__isnull=True).shadow) + self.assertEqual(Probe.objects.get(source=42).shadow, 42) + + def test_backfills_uuid_pk_across_batches(self): + """Regression: paging must not assume an integer pk (File has a UUID pk).""" + UUIDProbe = _make_uuid_probe_class() + self._create_model(UUIDProbe) + for i in range(1, 6): + UUIDProbe.objects.create(source=i * 10, shadow=None) + + # batch_size=2 forces the lower-bound advance where integer arithmetic on a + # UUID pk would blow up. + self._run_backfill(UUIDProbe, batch_size=2) + + self._assert_all_synced(UUIDProbe) + + def test_resumable_uuid_pk(self): + """--start-id must accept a UUID and resume from it.""" + UUIDProbe = _make_uuid_probe_class() + self._create_model(UUIDProbe) + objs = sorted( + [UUIDProbe.objects.create(source=i * 10, shadow=None) for i in range(1, 6)], + key=lambda o: o.pk, + ) + resume_pk = objs[2].pk + + self._run_backfill(UUIDProbe, batch_size=10, start_id=str(resume_pk)) + + self._assert_synced_from(UUIDProbe, resume_pk) + + def test_progress_check_passes_when_complete(self): + Probe = _make_probe_class() + self._create_model(Probe) + Probe.objects.create(source=1, shadow=1) + Probe.objects.create(source=None, shadow=None) # null source doesn't count + + output = self._run_backfill(Probe, progress_check=True) + + self.assertIn("no rows pending", output) + + def test_progress_check_fails_and_writes_nothing_when_incomplete(self): + Probe = _make_probe_class() + self._create_model(Probe) + Probe.objects.create(source=7, shadow=None) + + with self.assertRaisesRegex( + CommandError, "backfill incomplete: rows still pending" + ): + self._run_backfill(Probe, progress_check=True) + + # --progress-check must not write + self.assertIsNone(Probe.objects.get(source=7).shadow) + + +class BackfillColumnArgValidationTestCase(SimpleTestCase): + def _call(self, **kwargs): + call_command( + "backfill_column", + model="contentcuration.Channel", + source_field="name", + target_field="name", + **kwargs, + ) + + def test_non_positive_batch_size_raises(self): + for bad in (0, -1): + with self.subTest(batch_size=bad): + with self.assertRaisesRegex(CommandError, "--batch-size must be >= 1"): + self._call(batch_size=bad) diff --git a/contentcuration/contentcuration/tests/test_dual_write.py b/contentcuration/contentcuration/tests/test_dual_write.py new file mode 100644 index 0000000000..f123b39ad8 --- /dev/null +++ b/contentcuration/contentcuration/tests/test_dual_write.py @@ -0,0 +1,51 @@ +from django.db import connection +from django.db import models +from django.test import TransactionTestCase +from django.test.utils import isolate_apps + +from contentcuration.db.dual_write import mirror_field + + +@isolate_apps("contentcuration") +class MirrorFieldTestCase(TransactionTestCase): + def _delete_model(self, model): + with connection.schema_editor(atomic=False) as editor: + editor.delete_model(model) + + def test_long_field_names_truncate_trigger_name(self): + """Trigger name from long field names is truncated, not raised.""" + + @mirror_field("a_very_long_source_field_name", "a_very_long_target_field_name") + class LongNameProbe(models.Model): + a_very_long_source_field_name = models.IntegerField() + a_very_long_target_field_name = models.IntegerField(null=True) + + class Meta: + app_label = "contentcuration" + + self.assertEqual(len(LongNameProbe._meta.triggers), 1) + self.assertLessEqual(len(LongNameProbe._meta.triggers[0].name), 43) + + def test_syncs_shadow_column_in_db(self): + @mirror_field("source", "shadow") + class Probe(models.Model): + source = models.IntegerField() + shadow = models.IntegerField(null=True) + + class Meta: + app_label = "contentcuration" + + with connection.schema_editor(atomic=False) as editor: + editor.create_model(Probe) + self.addCleanup(self._delete_model, Probe) + for trigger in Probe._meta.triggers: + trigger.install(Probe) + + obj = Probe.objects.create(source=5) + obj.refresh_from_db() + self.assertEqual(obj.shadow, 5) + + obj.source = 9 + obj.save() + obj.refresh_from_db() + self.assertEqual(obj.shadow, 9) diff --git a/requirements-dev.in b/requirements-dev.in index 8a9224102c..7cd6acd113 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -8,3 +8,4 @@ pytest-timeout pre-commit==4.5.1 nodeenv drf-yasg==1.21.10 +django-migration-linter==6.0.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index ac6539f932..7391a7cb93 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,5 +1,7 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements-dev.in --output-file requirements-dev.txt +# uv pip compile requirements-dev.in -o requirements-dev.txt +appdirs==1.4.4 + # via django-migration-linter asgiref==3.3.4 # via # -c requirements.txt @@ -11,10 +13,13 @@ distlib==0.3.9 django==3.2.24 # via # -c requirements.txt + # django-migration-linter # djangorestframework # drf-yasg django-concurrent-test-helper==0.7.0 # via -r requirements-dev.in +django-migration-linter==6.0.0 + # via -r requirements-dev.in djangorestframework==3.15.1 # via # -c requirements.txt @@ -91,6 +96,8 @@ sqlparse==0.4.1 # django tblib==1.7.0 # via django-concurrent-test-helper +toml==0.10.2 + # via django-migration-linter tomli==1.2.3 # via pytest typing-extensions==4.15.0 diff --git a/requirements.in b/requirements.in index 2d59962414..6d24f23061 100644 --- a/requirements.in +++ b/requirements.in @@ -39,3 +39,5 @@ pydantic==2.12.5 latex2mathml==3.78.1 markdown-it-py==4.0.0 stripe>=5.0.0,<6.0.0 +django-pg-zero-downtime-migrations==0.14 # 0.14 adds Postgres 16 support +django-pgtrigger<4.12.0 # 4.12 drops Django 3.2 support diff --git a/requirements.txt b/requirements.txt index 0d66015b7a..2eb36c6ce5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements.in --output-file requirements.txt +# uv pip compile requirements.in -o requirements.txt amqp==5.1.1 # via kombu annotated-types==0.7.0 @@ -58,6 +58,8 @@ django==3.2.24 # django-filter # django-js-reverse # django-model-utils + # django-pg-zero-downtime-migrations + # django-pgtrigger # django-redis # django-registration # django-s3-storage @@ -79,6 +81,10 @@ django-model-utils==5.0.0 # via -r requirements.in django-mptt==0.16.0 # via -r requirements.in +django-pg-zero-downtime-migrations==0.14 + # via -r requirements.in +django-pgtrigger==4.11.0 + # via -r requirements.in django-postmark==0.1.6 # via -r requirements.in django-prometheus==2.3.1 From b631c2543b552e99b971d3c8ad59d77cb06a7dd7 Mon Sep 17 00:00:00 2001 From: Richard Tibbles Date: Mon, 22 Jun 2026 21:43:47 -0700 Subject: [PATCH 2/3] docs(migrations): add expand/contract zero-downtime runbook Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LfZvkigk8hdsKdEif3hzBi --- docs/_index.md | 4 ++ docs/zero_downtime_migrations.md | 107 +++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 docs/zero_downtime_migrations.md diff --git a/docs/_index.md b/docs/_index.md index c3e67006ba..072419ef13 100644 --- a/docs/_index.md +++ b/docs/_index.md @@ -17,6 +17,10 @@ - [Docker + Kubernetes Studio Instance Setup](./docker_kubernetes_setup.md) +## Database + +- [Zero-downtime migrations (expand/contract runbook)](./zero_downtime_migrations.md) + ## API - [API Endpoints](./api_endpoints.md) diff --git a/docs/zero_downtime_migrations.md b/docs/zero_downtime_migrations.md new file mode 100644 index 0000000000..3db56bf140 --- /dev/null +++ b/docs/zero_downtime_migrations.md @@ -0,0 +1,107 @@ +# Almost zero-downtime migrations — expand/contract runbook + +On large tables (e.g. `File` has ~100 M rows) a single migration can cause downtime in two ways: +- by taking an `ACCESS EXCLUSIVE` lock / rewriting the table +- by shipping a schema the still-running old pods can't use (a dropped or renamed column). + +The expand/contract procedure below avoids both. Its one residual cost is the brief metadata-only lock taken for the drop + rename migration, hence "almost." + +## Guards (already configured) + +- `django-pg-zero-downtime-migrations` - Lock/statement timeouts and `RAISE_FOR_UNSAFE` make rewriting or exclusively-locking DDL fail at migrate time. ([package docs](https://github.com/tbicr/django-pg-zero-downtime-migrations#configuration)). +- `django-migration-linter` - flags backward-incompatible schema (drops, renames, NOT NULL adds) old pods would break on. + +`test_settings.py` sets `RAISE_FOR_UNSAFE = False` so the test DB can replay pre-guard legacy migrations; runtime and CI still enforce both guards. + +## Procedure + +Goal: widen `File.file_size` from int to bigint with no table rewrite and no backward-incompatible window. The app-visible column stays named `file_size` throughout. Only its underlying storage swaps — from the int column to a pre-backfilled bigint column. Because the name is preserved, old pods keep writing to `file_size` (now bigint) without error. + +### Release 1 — expand + +Add the shadow field and the dual-write trigger: + +```python +from contentcuration.db.dual_write import mirror_field + +@mirror_field("file_size", "file_size_bigint") +class File(models.Model): + file_size = models.IntegerField(blank=True, null=True) + file_size_bigint = models.BigIntegerField(blank=True, null=True) +``` + +`makemigrations` emits a nullable `AddField` and the `CreateTrigger` — both safe (no rewrite, no lock). New writes now land in both columns. + +Backfill old rows in the same release: wire `backfill_column` as a `deploy-migrate` step in the Makefile, which runs after `migrate`, so the column and trigger already exist: + +```bash +python contentcuration/manage.py backfill_column \ + --model contentcuration.File --source-field file_size --target-field file_size_bigint +``` + +Can also run the above command with `--progress-check` as a read only to see if any backfills are still required. + +### Release 2 — swap (cutover + rename) + +After backfill completes, swap the storage in a single migration. Drop the shadow field and decorator; `file_size` is now bigint: + +```python +class File(models.Model): + file_size = models.BigIntegerField(blank=True, null=True) +``` + +The migration drops the trigger and the int column, then renames the bigint column onto `file_size`: + +```python +operations = [ + IgnoreMigration(), # safe: net change is an int->bigint widening; see note below + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.RemoveField("file", "file_size_bigint"), + migrations.AlterField( + "file", "file_size", models.BigIntegerField(blank=True, null=True) + ), + pgtrigger.migrations.RemoveTrigger( + "file", "mirror_file_size_to_file_size_bigint" + ), + ], + database_operations=[ + migrations.RunSQL( + sql=( + "DROP TRIGGER IF EXISTS pgtrigger_mirror_file_size_to_file_size_bigint_54326" + " ON contentcuration_file;" + 'ALTER TABLE contentcuration_file DROP COLUMN "file_size";' + 'ALTER TABLE contentcuration_file RENAME COLUMN "file_size_bigint" TO "file_size";' + ), + reverse_sql=( + 'ALTER TABLE contentcuration_file RENAME COLUMN "file_size" TO "file_size_bigint";' + 'ALTER TABLE contentcuration_file ADD COLUMN "file_size" integer;' + ), + ), + ], + ), +] +``` + +`SeparateDatabaseAndState` allows us to let Django know what has been migrated, while doing specific raw SQL operations to get the exact data preserving sequence of events that we want. Copy the trigger `pgid` from release 1's `AddTrigger`. + +Why the swap is transparent to old pods: + +- Their queries reference `file_size` by name; the swap preserves that name, so they keep working — their int writes fit the bigint column. +- The net app-visible change is an `int → bigint` widening, which is backward-compatible. +- The only disruption is the brief metadata-only lock while the DDL runs; `DROP COLUMN` / `RENAME COLUMN` don't rewrite the table. + +The linter flags the drop and rename as backward-incompatible; `IgnoreMigration()` acknowledges the sequencing makes them safe. + +**Don't cut over to the physical name first.** Aliasing the ORM field to `file_size_bigint` via `db_column` creates a pod generation that queries `file_size_bigint` by name. The later rename then breaks that generation for the whole rollover, and adds a release. Preserving `file_size` is what makes the rename free. + +## Tooling + +- **`@mirror_field(source, target)`** in `contentcuration/db/dual_write.py` — BEFORE INSERT/UPDATE trigger copying field `source` → `target`. Change-guarded: an unconditional copy corrupts data at swap. +- **`backfill_column`** — idempotent, resumable (`--start-id `), batched (`--batch-size`); one transaction per batch. `--progress-check` tests for remaining rows without writing and exits nonzero if any remain. +- **`lintmigrations`** — run locally before pushing: + ```bash + python contentcuration/manage.py lintmigrations --git-commit-id --no-cache --warnings-as-errors + ``` + `--git-commit-id` is a flag, not positional — a positional value is read as an app label and lints nothing. +- **`IgnoreMigration()`** — escape hatch for a migration whose backward-incompatibility is made safe by release sequencing. From 84f402d23f5fa9b37945dea6ac5d8dbd06e034ff Mon Sep 17 00:00:00 2001 From: Richard Tibbles Date: Mon, 22 Jun 2026 23:26:15 -0700 Subject: [PATCH 3/3] feat(models): widen File.file_size to bigint, expand stage (studio#5974) Expand stage of the zero-downtime int->bigint widening: - Add nullable file_size_bigint shadow column and its index (built CONCURRENTLY). - Mirror file_size into it via the change-guarded @mirror_field trigger. - Wire the online backfill as a commented deploy-migrate step. - Stage the cutover and contract steps as comments on the File model. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01LfZvkigk8hdsKdEif3hzBi --- Makefile | 3 +- .../0167_file_size_bigint_expand.py | 43 +++++++++++++++++++ contentcuration/contentcuration/models.py | 19 ++++++++ 3 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 contentcuration/contentcuration/migrations/0167_file_size_bigint_expand.py diff --git a/Makefile b/Makefile index 27c222a7d2..2f7ff57212 100644 --- a/Makefile +++ b/Makefile @@ -39,7 +39,8 @@ migrate: # 4) Remove the management command from this `deploy-migrate` recipe # 5) Repeat! deploy-migrate: - echo "Nothing to do here!" + # studio#5974: remove at cutover. + python contentcuration/manage.py backfill_column --model contentcuration.File --source-field file_size --target-field file_size_bigint contentnodegc: python contentcuration/manage.py garbage_collect diff --git a/contentcuration/contentcuration/migrations/0167_file_size_bigint_expand.py b/contentcuration/contentcuration/migrations/0167_file_size_bigint_expand.py new file mode 100644 index 0000000000..6fc8d87188 --- /dev/null +++ b/contentcuration/contentcuration/migrations/0167_file_size_bigint_expand.py @@ -0,0 +1,43 @@ +# Generated by Django 3.2.24 on 2026-06-23 05:56 +import pgtrigger.compiler +import pgtrigger.migrations +from django.db import migrations +from django.db import models +from django.db.models import Q + + +class Migration(migrations.Migration): + + dependencies = [ + ("contentcuration", "0166_add_usersubscription"), + ] + + operations = [ + migrations.AddField( + model_name="file", + name="file_size_bigint", + field=models.BigIntegerField(blank=True, null=True), + ), + migrations.AddIndex( + model_name="file", + index=models.Index( + fields=["checksum", "file_size_bigint"], + name="file_checksum_fsizebig_idx", + condition=Q(file_size_bigint__isnull=False), + ), + ), + pgtrigger.migrations.AddTrigger( + model_name="file", + trigger=pgtrigger.compiler.Trigger( + name="mirror_file_size_to_file_size_bigint", + sql=pgtrigger.compiler.UpsertTriggerSql( + func="IF NEW.file_size IS DISTINCT FROM OLD.file_size THEN NEW.file_size_bigint = NEW.file_size; END IF; RETURN NEW;", + hash="051e321c4cdf91ea81f96b9f9a29e3b5015def67", + operation="INSERT OR UPDATE", + pgid="pgtrigger_mirror_file_size_to_file_size_bigint_54326", + table="contentcuration_file", + when="BEFORE", + ), + ), + ), + ] diff --git a/contentcuration/contentcuration/models.py b/contentcuration/contentcuration/models.py index ae2ab2b615..1f0c1ec8c5 100644 --- a/contentcuration/contentcuration/models.py +++ b/contentcuration/contentcuration/models.py @@ -79,6 +79,7 @@ from contentcuration.constants import feedback from contentcuration.constants import user_history from contentcuration.constants.contentnode import kind_activity_map +from contentcuration.db.dual_write import mirror_field from contentcuration.db.models.expressions import Array from contentcuration.db.models.functions import ArrayRemove from contentcuration.db.models.functions import Unnest @@ -3255,6 +3256,8 @@ class StagedFile(models.Model): FILE_DISTINCT_INDEX_NAME = "file_checksum_file_size_idx" +# studio#5974: bigint shadow of FILE_DISTINCT_INDEX_NAME, for the file_size widening. +FILE_DISTINCT_BIGINT_INDEX_NAME = "file_checksum_fsizebig_idx" FILE_MODIFIED_DESC_INDEX_NAME = "file_modified_desc_idx" FILE_DURATION_CONSTRAINT = "file_media_duration_int" MEDIA_PRESETS = [ @@ -3266,6 +3269,14 @@ class StagedFile(models.Model): ] +# studio#5974 swap (next release, after backfill completes). One migration: +# - drop the @mirror_field decorator and the file_size_bigint field below +# - file_size = models.BigIntegerField(blank=True, null=True) +# - DB ops: drop the trigger + int file_size column, then RENAME file_size_bigint -> file_size +# - wrap in SeparateDatabaseAndState so the int->bigint AlterField is state-only (no rewrite) +# Transparent to old pods: they keep writing file_size (now bigint); only a brief metadata lock. +# Do NOT add db_column to reach file_size_bigint first — that generation breaks at the rename. +@mirror_field("file_size", "file_size_bigint") # studio#5974: dual-write int->bigint class File(models.Model): """ The bottom layer of the contentDB schema, defines the basic building brick for content. @@ -3275,6 +3286,9 @@ class File(models.Model): id = UUIDField(primary_key=True, default=uuid.uuid4) checksum = models.CharField(max_length=400, blank=True, db_index=True) file_size = models.IntegerField(blank=True, null=True) + file_size_bigint = models.BigIntegerField( + blank=True, null=True + ) # studio#5974 shadow file_on_disk = models.FileField( upload_to=object_storage_name, storage=default_storage, @@ -3485,6 +3499,11 @@ class Meta: models.Index( fields=["checksum", "file_size"], name=FILE_DISTINCT_INDEX_NAME ), + models.Index( + fields=["checksum", "file_size_bigint"], + name=FILE_DISTINCT_BIGINT_INDEX_NAME, + condition=Q(file_size_bigint__isnull=False), + ), models.Index(fields=["-modified"], name=FILE_MODIFIED_DESC_INDEX_NAME), ] constraints = [