openedx · bradenmacdonald · Mar 20, 2026 · Mar 21, 2026 · Mar 21, 2026
diff --git a/src/openedx_tagging/api.py b/src/openedx_tagging/api.py
@@ -22,11 +22,13 @@
 
 from .data import TagDataQuerySet
 from .models import ObjectTag, Tag, Taxonomy
-from .models.utils import ConcatNull, StringAgg
 
 # Export this as part of the API
 TagDoesNotExist = Tag.DoesNotExist
 
+# Maximum number of tags allowed on any one object
+OBJECT_MAX_TAGS = 100
+
 
 def create_taxonomy(  # pylint: disable=too-many-positional-arguments
     name: str,
@@ -198,15 +200,14 @@ def get_object_tags(
         base_qs
         # Preload related objects, including data for the "get_lineage" method on ObjectTag/Tag:
         .select_related("taxonomy", "tag", "tag__parent", "tag__parent__parent")
-        # Sort the tags within each taxonomy in "tree order". See Taxonomy._get_filtered_tags_deep for details on this:
-        .annotate(sort_key=Lower(Concat(
-            ConcatNull(F("tag__parent__parent__parent__value"), Value("\t")),
-            ConcatNull(F("tag__parent__parent__value"), Value("\t")),
-            ConcatNull(F("tag__parent__value"), Value("\t")),
-            Coalesce(F("tag__value"), F("_value")),
-            Value("\t"),
-            output_field=models.CharField(),
-        )))
+        # Sort the tags within each taxonomy in "tree order". See Taxonomy._get_filtered_tags_deep for details on this.
+        # tag__lineage is a case-insensitive column storing the full ancestor path, e.g. "Root\tParent\tThis\t".
+        # Free-text and deleted tags (tag_id IS NULL) fall back to their cached _value.
+        .annotate(sort_key=Coalesce(
+            Lower(F("tag__lineage")),
+            Lower(Concat(F("_value"), Value("\t"))),
+            output_field=models.TextField(),
+        ))
         .annotate(taxonomy_name=Coalesce(F("taxonomy__name"), F("_export_id")))
         # Sort first by taxonomy name, then by tag value in tree order:
         .order_by("taxonomy_name", "sort_key")
@@ -222,6 +223,9 @@ def get_object_tag_counts(object_id_pattern: str, count_implicit=False) -> dict[
 
     Deleted tags and disabled taxonomies are excluded from the counts, even if
     ObjectTag data about them is present.
+
+    count_implicit: if True, also count all ancestor tags (implicit tags) of
+    the explicit tags that are associated with the object.
     """
     # Note: in the future we may add an option to exclude system taxonomies from the count.
     qs: Any = ObjectTag.objects
@@ -236,32 +240,30 @@ def get_object_tag_counts(object_id_pattern: str, count_implicit=False) -> dict[
     qs = qs.exclude(taxonomy__enabled=False)  # The whole taxonomy is disabled
     qs = qs.exclude(tag_id=None, taxonomy__allow_free_text=False)  # The taxonomy exists but the tag is deleted
     if count_implicit:
-        # Counting the implicit tags is tricky, because if two "grandchild" tags have the same implicit parent tag, we
-        # need to count that parent tag only once. To do that, we collect all the ancestor tag IDs into an aggregate
-        # string, and then count the unique values using python
-        qs = qs.values("object_id").annotate(
-            num_tags=models.Count("id"),
-            tag_ids_str_1=StringAgg("tag_id"),
-            tag_ids_str_2=StringAgg("tag__parent_id"),
-            tag_ids_str_3=StringAgg("tag__parent__parent_id"),
-            tag_ids_str_4=StringAgg("tag__parent__parent__parent_id"),
-        ).order_by("object_id")
-        result = {}
+        # Use tag__lineage to count implicit (ancestor) tags at any depth.
+        # Each tag's lineage encodes its full ancestry path, e.g. "root\tchild\tgrandchild\t", and every prefix
+        # of that path (at each \t boundary) identifies one tag in the chain, so collecting prefixes into a set
+        # deduplicates shared ancestors. NOTE(review): paths are not keyed by taxonomy, so identical paths in two
+        # taxonomies on the same object collapse into one -- confirm whether taxonomy_id should be part of the key.
+        qs = qs.annotate(sort_key=F("tag__lineage")).values("object_id", "sort_key")
+        result: dict = {}
         for row in qs:
-            # ObjectTags for free text taxonomies will be included in "num_tags" count, but not "tag_ids_str_1" since
-            # they have no tag ID. We can compute how many free text tags each object has now:
-            if row["tag_ids_str_1"]:
-                num_free_text_tags = row["num_tags"] - len(row["tag_ids_str_1"].split(","))
+            object_id = row["object_id"]
+            if object_id not in result:
+                result[object_id] = {"free_text": 0, "paths": set()}
+            sort_key = row["sort_key"]
+            if sort_key is None:
+                # Free-text tag: no Tag record, so no sort_key
+                result[object_id]["free_text"] += 1
             else:
-                num_free_text_tags = row["num_tags"]
-            # Then we count the total number of *unique* Tags for this object, both implicit and explicit:
-            other_tag_ids = set()
-            for field in ("tag_ids_str_1", "tag_ids_str_2", "tag_ids_str_3", "tag_ids_str_4"):
-                if row[field] is not None:
-                    for tag_id in row[field].split(","):
-                        other_tag_ids.add(int(tag_id))
-            result[row["object_id"]] = num_free_text_tags + len(other_tag_ids)
-        return result
+                # Add the sort_key prefix for each ancestor level
+                parts = sort_key.rstrip("\t").split("\t")
+                for i in range(1, len(parts) + 1):
+                    result[object_id]["paths"].add("\t".join(parts[:i]))
+        return {
+            object_id: data["free_text"] + len(data["paths"])
+            for object_id, data in result.items()
+        }
     else:
         qs = qs.values("object_id").annotate(num_tags=models.Count("id")).order_by("object_id")
         return {row["object_id"]: row["num_tags"] for row in qs}
@@ -283,17 +285,17 @@ def _check_new_tag_count(
     taxonomy_export_id: str | None = None,
 ) -> None:
     """
-    Checks if the new count of tags for the object is equal or less than 100
+    Checks if the new count of tags for the object is equal or less than OBJECT_MAX_TAGS
     """
     # Exclude to avoid counting the tags that are going to be updated
     if taxonomy:
         current_count = ObjectTag.objects.filter(object_id=object_id).exclude(taxonomy_id=taxonomy.id).count()
     else:
         current_count = ObjectTag.objects.filter(object_id=object_id).exclude(_export_id=taxonomy_export_id).count()
 
-    if current_count + new_tag_count > 100:
+    if current_count + new_tag_count > OBJECT_MAX_TAGS:
         raise ValueError(
-            _("Cannot add more than 100 tags to ({object_id}).").format(object_id=object_id)
+            _("Cannot add more than {limit} tags to ({object_id}).").format(object_id=object_id, limit=OBJECT_MAX_TAGS)
         )
 
 

diff --git a/src/openedx_tagging/migrations/0020_tag_depth_and_lineage.py b/src/openedx_tagging/migrations/0020_tag_depth_and_lineage.py
@@ -0,0 +1,163 @@
+"""
+(1) Add a concrete 'depth' column to the oel_tagging_tag table.
+
+The depth column stores:
+  - 0 for root tags (parent IS NULL)
+  - parent.depth + 1 for all other tags
+
+A CHECK constraint enforces this invariant at the database level:
+  parent_id IS NULL OR depth > 0
+
+(2) Add a concrete 'lineage' column to the oel_tagging_tag table.
+
+The lineage column stores the full tab-separated ancestor path including the
+tag itself:
+
+    "RootValue\\tParentValue\\t...\\tThisValue\\t"
+
+with original casing and a trailing tab delimiter. Because the column uses a
+case-insensitive collation, ORDER BY lineage gives the depth-first tree order
+that we want when querying the taxonomy tree.
+
+The trailing tab makes prefix matching unambiguous: every descendant of tag T
+has a lineage that starts with T.lineage (since T.lineage ends with '\\t' and
+no tag value can contain '\\t').
+"""
+
+from django.db import migrations, models
+from django.db.models.functions import Concat, Length, Replace
+
+import openedx_django_lib.fields
+
+
+def populate_depth_and_lineage(apps, _schema_editor):
+    """
+    Populate the new `depth` and `lineage` columns for all existing tags by walking the hierarchy one level at a
+    time (root tags first, then their children, etc.), one UPDATE per tag per pass. NOTE(review): every row starts
+    at the default depth=0, so early passes also match deeper tags; later passes overwrite them -- confirm intended.
+    """
+    Tag = apps.get_model("oel_tagging", "Tag")
+    # Root tags: depth 0, lineage = "value\t"
+    for tag in Tag.objects.filter(parent__isnull=True).only("id", "value"):
+        Tag.objects.filter(pk=tag.pk).update(depth=0, lineage=tag.value + "\t")
+    # Walk down the tree one level at a time; list() materializes each pass before any of its rows are updated.
+    for level in range(1, 20):  # Depth should be at most 3 or 4, but it doesn't hurt to be thorough.
+        children = list(
+            Tag.objects.filter(parent__depth=level - 1).select_related("parent").only("id", "value", "parent__lineage")
+        )
+        if not children:
+            break
+        for tag in children:
+            Tag.objects.filter(pk=tag.pk).update(
+                depth=level,
+                lineage=tag.parent.lineage + tag.value + "\t",
+            )
+
+
+def reverse_populate_depth_and_lineage(_apps, _schema_editor):
+    """No-op reverse step: both fields are dropped on reverse, so no cleanup is needed."""
+
+
+def _create_lineage_index(_apps, schema_editor):
+    """
+    Build the ``oel_tagging_lineage_d65f82_idx`` index on ``oel_tagging_tag.lineage``.
+
+    InnoDB (MySQL) caps an index key at 3072 bytes, and a utf8mb4 character may
+    occupy 4 bytes, so indexing the entire VARCHAR(3006) column could require
+    12,024 bytes. On MySQL only the first 768 characters are indexed
+    (768 × 4 = 3072 bytes, exactly the cap); every other backend, e.g. SQLite
+    or PostgreSQL, gets a plain full-column index.
+    """
+    full_column_sql = "CREATE INDEX oel_tagging_lineage_d65f82_idx ON oel_tagging_tag (lineage)"
+    mysql_prefix_sql = "CREATE INDEX oel_tagging_lineage_d65f82_idx ON oel_tagging_tag (lineage(768))"
+    on_mysql = schema_editor.connection.vendor == "mysql"
+    schema_editor.execute(mysql_prefix_sql if on_mysql else full_column_sql)
+
+
+def _drop_lineage_index(_apps, schema_editor):
+    """Reverse of the index creation; MySQL's DROP INDEX syntax additionally requires the table name."""
+    drop_sql = "DROP INDEX oel_tagging_lineage_d65f82_idx"
+    mysql_drop_sql = "DROP INDEX oel_tagging_lineage_d65f82_idx ON oel_tagging_tag"
+    schema_editor.execute(mysql_drop_sql if schema_editor.connection.vendor == "mysql" else drop_sql)
+
+
+class Migration(migrations.Migration):
+    """Add depth and lineage columns to Tag, populate them, then add CHECK constraints and a lineage index."""
+
+    # Even though this migration no longer creates a view, we keep atomic=False
+    # as a safety measure since this migration touched DDL on MySQL in its prior form.
+    atomic = False
+
+    dependencies = [
+        ("oel_tagging", "0019_language_taxonomy_class"),
+    ]
+
+    operations = [
+        # 1. Add the depth column to oel_tagging_tag with a safe default of 0.
+        migrations.AddField(
+            model_name="tag",
+            name="depth",
+            field=models.IntegerField(
+                default=0,
+                help_text="Number of ancestors this tag has. Zero for root tags, one for their children, and so on. Set automatically by save(); do not set manually.",
+            ),
+        ),
+        # 2. Add the lineage column with an empty default (populated below).
+        migrations.AddField(
+            model_name="tag",
+            name="lineage",
+            field=openedx_django_lib.fields.MultiCollationCharField(
+                db_collations={"mysql": "utf8mb4_unicode_ci", "sqlite": "NOCASE"},
+                default="",
+                help_text="Tab-separated ancestor path including this tag: 'Root\\tParent\\t...\\tThisValue\\t'. Used for depth-first tree ordering and descendant prefix matching. Set automatically by save(); do not set manually.",
+                max_length=3006,
+            ),
+        ),
+        # 3. Populate depth and lineage for all pre-existing tags.
+        migrations.RunPython(populate_depth_and_lineage, reverse_populate_depth_and_lineage, elidable=False),
+        # 4. Add CHECK constraints now that values exist: depth vs. parent, lineage suffix, tab count == depth + 1.
+        migrations.AddConstraint(
+            model_name="tag",
+            constraint=models.CheckConstraint(
+                condition=(models.Q(parent_id__isnull=True) | models.Q(depth__gt=0)),
+                name="oel_tagging_tag_depth_parent_check",
+            ),
+        ),
+        migrations.AddConstraint(
+            model_name="tag",
+            constraint=models.CheckConstraint(
+                condition=models.Q(lineage__endswith=Concat(models.F("value"), models.Value("\t"))),
+                name="oel_tagging_tag_lineage_ends_with_value",
+            ),
+        ),
+        migrations.AddConstraint(
+            model_name="tag",
+            constraint=models.CheckConstraint(
+                condition=models.Q(
+                    depth=(
+                        Length(models.F("lineage"))
+                        - Length(Replace(models.F("lineage"), models.Value("\t"), models.Value("")))
+                        - 1
+                    )
+                ),
+                name="oel_tagging_tag_lineage_tab_count_check",
+            ),
+        ),
+        # 5. Add index on lineage after data is populated, so the build scans real values.
+        #    MySQL's InnoDB limits index keys to 3072 bytes; with utf8mb4 (4 bytes/char) that
+        #    caps a full-column index at 768 chars — far shorter than max_length=3006.  We use
+        #    SeparateDatabaseAndState so Django's migration state records the index normally
+        #    (avoiding spurious makemigrations noise) while the actual SQL uses a 768-char
+        #    prefix on MySQL and a regular full-column index everywhere else.
+        migrations.SeparateDatabaseAndState(
+            state_operations=[
+                migrations.AddIndex(
+                    model_name="tag",
+                    index=models.Index(fields=["lineage"], name="oel_tagging_lineage_d65f82_idx"),
+                ),
+            ],
+            database_operations=[
+                migrations.RunPython(_create_lineage_index, _drop_lineage_index, elidable=False),
+            ],
+        ),
+    ]