diff --git a/src/openedx_tagging/api.py b/src/openedx_tagging/api.py index 767915e7..cb80755b 100644 --- a/src/openedx_tagging/api.py +++ b/src/openedx_tagging/api.py @@ -22,11 +22,13 @@ from .data import TagDataQuerySet from .models import ObjectTag, Tag, Taxonomy -from .models.utils import ConcatNull, StringAgg # Export this as part of the API TagDoesNotExist = Tag.DoesNotExist +# Maximum number of tags allowed on any one object +OBJECT_MAX_TAGS = 100 + def create_taxonomy( # pylint: disable=too-many-positional-arguments name: str, @@ -198,15 +200,14 @@ def get_object_tags( base_qs # Preload related objects, including data for the "get_lineage" method on ObjectTag/Tag: .select_related("taxonomy", "tag", "tag__parent", "tag__parent__parent") - # Sort the tags within each taxonomy in "tree order". See Taxonomy._get_filtered_tags_deep for details on this: - .annotate(sort_key=Lower(Concat( - ConcatNull(F("tag__parent__parent__parent__value"), Value("\t")), - ConcatNull(F("tag__parent__parent__value"), Value("\t")), - ConcatNull(F("tag__parent__value"), Value("\t")), - Coalesce(F("tag__value"), F("_value")), - Value("\t"), - output_field=models.CharField(), - ))) + # Sort the tags within each taxonomy in "tree order". See Taxonomy._get_filtered_tags_deep for details on this. + # tag__lineage is a case-insensitive column storing the full ancestor path, e.g. "Root\tParent\tThis\t". + # Free-text and deleted tags (tag_id IS NULL) fall back to their cached _value. + .annotate(sort_key=Coalesce( + Lower(F("tag__lineage")), + Lower(Concat(F("_value"), Value("\t"))), + output_field=models.TextField(), + )) .annotate(taxonomy_name=Coalesce(F("taxonomy__name"), F("_export_id"))) # Sort first by taxonomy name, then by tag value in tree order: .order_by("taxonomy_name", "sort_key") @@ -222,6 +223,9 @@ def get_object_tag_counts(object_id_pattern: str, count_implicit=False) -> dict[ Deleted tags and disabled taxonomies are excluded from the counts, even if ObjectTag data about them is present. + + count_implicit: if True, this means to count all ancestor tags (implicit + tags) of the explict tags that are associated with the object. """ # Note: in the future we may add an option to exclude system taxonomies from the count. qs: Any = ObjectTag.objects @@ -236,32 +240,30 @@ def get_object_tag_counts(object_id_pattern: str, count_implicit=False) -> dict[ qs = qs.exclude(taxonomy__enabled=False) # The whole taxonomy is disabled qs = qs.exclude(tag_id=None, taxonomy__allow_free_text=False) # The taxonomy exists but the tag is deleted if count_implicit: - # Counting the implicit tags is tricky, because if two "grandchild" tags have the same implicit parent tag, we - # need to count that parent tag only once. To do that, we collect all the ancestor tag IDs into an aggregate - # string, and then count the unique values using python - qs = qs.values("object_id").annotate( - num_tags=models.Count("id"), - tag_ids_str_1=StringAgg("tag_id"), - tag_ids_str_2=StringAgg("tag__parent_id"), - tag_ids_str_3=StringAgg("tag__parent__parent_id"), - tag_ids_str_4=StringAgg("tag__parent__parent__parent_id"), - ).order_by("object_id") - result = {} + # Use tag__lineage to count implicit (ancestor) tags at any depth. + # Each tag's lineage encodes its full ancestry path, e.g. "root\tchild\tgrandchild\t". + # Every prefix of that path (at each \t boundary) uniquely identifies one tag in the chain, + # so collecting prefixes into a set naturally deduplicates shared ancestors across multiple + # tags on the same object. + qs = qs.annotate(sort_key=F("tag__lineage")).values("object_id", "sort_key") + result: dict = {} for row in qs: - # ObjectTags for free text taxonomies will be included in "num_tags" count, but not "tag_ids_str_1" since - # they have no tag ID. We can compute how many free text tags each object has now: - if row["tag_ids_str_1"]: - num_free_text_tags = row["num_tags"] - len(row["tag_ids_str_1"].split(",")) + object_id = row["object_id"] + if object_id not in result: + result[object_id] = {"free_text": 0, "paths": set()} + sort_key = row["sort_key"] + if sort_key is None: + # Free-text tag: no Tag record, so no sort_key + result[object_id]["free_text"] += 1 else: - num_free_text_tags = row["num_tags"] - # Then we count the total number of *unique* Tags for this object, both implicit and explicit: - other_tag_ids = set() - for field in ("tag_ids_str_1", "tag_ids_str_2", "tag_ids_str_3", "tag_ids_str_4"): - if row[field] is not None: - for tag_id in row[field].split(","): - other_tag_ids.add(int(tag_id)) - result[row["object_id"]] = num_free_text_tags + len(other_tag_ids) - return result + # Add the sort_key prefix for each ancestor level + parts = sort_key.rstrip("\t").split("\t") + for i in range(1, len(parts) + 1): + result[object_id]["paths"].add("\t".join(parts[:i])) + return { + object_id: data["free_text"] + len(data["paths"]) + for object_id, data in result.items() + } else: qs = qs.values("object_id").annotate(num_tags=models.Count("id")).order_by("object_id") return {row["object_id"]: row["num_tags"] for row in qs} @@ -283,7 +285,7 @@ def _check_new_tag_count( taxonomy_export_id: str | None = None, ) -> None: """ - Checks if the new count of tags for the object is equal or less than 100 + Checks if the new count of tags for the object is equal or less than OBJECT_MAX_TAGS """ # Exclude to avoid counting the tags that are going to be updated if taxonomy: @@ -291,9 +293,9 @@ def _check_new_tag_count( else: current_count = ObjectTag.objects.filter(object_id=object_id).exclude(_export_id=taxonomy_export_id).count() - if current_count + new_tag_count > 100: + if current_count + new_tag_count > OBJECT_MAX_TAGS: raise ValueError( - _("Cannot add more than 100 tags to ({object_id}).").format(object_id=object_id) + _("Cannot add more than {limit} tags to ({object_id}).").format(object_id=object_id, limit=OBJECT_MAX_TAGS) ) diff --git a/src/openedx_tagging/migrations/0020_tag_depth_and_lineage.py b/src/openedx_tagging/migrations/0020_tag_depth_and_lineage.py new file mode 100644 index 00000000..9eabda27 --- /dev/null +++ b/src/openedx_tagging/migrations/0020_tag_depth_and_lineage.py @@ -0,0 +1,163 @@ +""" +(1) Add a concrete 'depth' column to the oel_tagging_tag table. + +The depth column stores: + - 0 for root tags (parent IS NULL) + - parent.depth + 1 for all other tags + +A CHECK constraint enforces this invariant at the database level: + parent_id IS NULL OR depth > 0 + +(2) Add a concrete 'lineage' column to the oel_tagging_tag table. + +The lineage column stores the full tab-separated ancestor path including the +tag itself: + + "RootValue\\tParentValue\\t...\\tThisValue\\t" + +with original casing and a trailing tab delimiter. Because the column uses a +case-insensitive collation, ORDER BY lineage gives the depth-first tree order +that we want when querying the taxonomy tree. + +The trailing tab makes prefix matching unambiguous: every descendant of tag T +has a lineage that starts with T.lineage (since T.lineage ends with '\\t' and +no tag value can contain '\\t'). +""" + +from django.db import migrations, models +from django.db.models.functions import Concat, Length, Replace + +import openedx_django_lib.fields + + +def populate_depth_and_lineage(apps, _schema_editor): + """ + Populate the new `depth` and `lineage` columns for all existing tags by + walking the hierarchy one level at a time (root tags first, then their + children, etc.). + """ + Tag = apps.get_model("oel_tagging", "Tag") + # Root tags: depth 0, lineage = "value\t" + for tag in Tag.objects.filter(parent__isnull=True).only("id", "value"): + Tag.objects.filter(pk=tag.pk).update(depth=0, lineage=tag.value + "\t") + # Walk down the tree one level at a time. + for level in range(1, 20): # Depth should be at most 3 or 4, but it doesn't hurt to be thorough. + children = list( + Tag.objects.filter(parent__depth=level - 1).select_related("parent").only("id", "value", "parent__lineage") + ) + if not children: + break + for tag in children: + Tag.objects.filter(pk=tag.pk).update( + depth=level, + lineage=tag.parent.lineage + tag.value + "\t", + ) + + +def reverse_populate_depth_and_lineage(_apps, _schema_editor): + pass # Both fields are dropped on reverse, so no cleanup needed. + + +def _create_lineage_index(_apps, schema_editor): + """ + Create an index on the lineage column. + + MySQL's InnoDB limits index key length to 3072 bytes; with utf8mb4 (up to + 4 bytes per character) a full-column index on a VARCHAR(3006) would require + up to 12,024 bytes — far over the limit. We therefore use a 768-character + prefix on MySQL (768 × 4 = 3072 bytes, exactly at the limit) and a regular + full-column index on SQLite and PostgreSQL. + """ + if schema_editor.connection.vendor == "mysql": + schema_editor.execute("CREATE INDEX oel_tagging_lineage_d65f82_idx ON oel_tagging_tag (lineage(768))") + else: + schema_editor.execute("CREATE INDEX oel_tagging_lineage_d65f82_idx ON oel_tagging_tag (lineage)") + + +def _drop_lineage_index(_apps, schema_editor): + if schema_editor.connection.vendor == "mysql": + schema_editor.execute("DROP INDEX oel_tagging_lineage_d65f82_idx ON oel_tagging_tag") + else: + schema_editor.execute("DROP INDEX oel_tagging_lineage_d65f82_idx") + + +class Migration(migrations.Migration): + """Add depth and lineage columns to Tag; remove the oel_tagging_tag_computed view.""" + + # Even though this migration no longer creates a view, we keep atomic=False + # as a safety measure since this migration touched DDL on MySQL in its prior form. + atomic = False + + dependencies = [ + ("oel_tagging", "0019_language_taxonomy_class"), + ] + + operations = [ + # 1. Add the depth column to oel_tagging_tag with a safe default of 0. + migrations.AddField( + model_name="tag", + name="depth", + field=models.IntegerField( + default=0, + help_text="Number of ancestors this tag has. Zero for root tags, one for their children, and so on. Set automatically by save(); do not set manually.", + ), + ), + # 2. Add the lineage column with an empty default (populated below). + migrations.AddField( + model_name="tag", + name="lineage", + field=openedx_django_lib.fields.MultiCollationCharField( + db_collations={"mysql": "utf8mb4_unicode_ci", "sqlite": "NOCASE"}, + default="", + help_text="Tab-separated ancestor path including this tag: 'Root\\tParent\\t...\\tThisValue\\t'. Used for depth-first tree ordering and descendant prefix matching. Set automatically by save(); do not set manually.", + max_length=3006, + ), + ), + # 3. Populate depth and lineage for all pre-existing tags. + migrations.RunPython(populate_depth_and_lineage, reverse_populate_depth_and_lineage, elidable=False), + # 4. Add CHECK constraints, once we've populated the values. + migrations.AddConstraint( + model_name="tag", + constraint=models.CheckConstraint( + condition=(models.Q(parent_id__isnull=True) | models.Q(depth__gt=0)), + name="oel_tagging_tag_depth_parent_check", + ), + ), + migrations.AddConstraint( + model_name="tag", + constraint=models.CheckConstraint( + condition=models.Q(lineage__endswith=Concat(models.F("value"), models.Value("\t"))), + name="oel_tagging_tag_lineage_ends_with_value", + ), + ), + migrations.AddConstraint( + model_name="tag", + constraint=models.CheckConstraint( + condition=models.Q( + depth=( + Length(models.F("lineage")) + - Length(Replace(models.F("lineage"), models.Value("\t"), models.Value(""))) + - 1 + ) + ), + name="oel_tagging_tag_lineage_tab_count_check", + ), + ), + # 5. Add index on lineage after data is populated, so the build scans real values. + # MySQL's InnoDB limits index keys to 3072 bytes; with utf8mb4 (4 bytes/char) that + # caps a full-column index at 768 chars — far shorter than max_length=3006. We use + # SeparateDatabaseAndState so Django's migration state records the index normally + # (avoiding spurious makemigrations noise) while the actual SQL uses a 768-char + # prefix on MySQL and a regular full-column index everywhere else. + migrations.SeparateDatabaseAndState( + state_operations=[ + migrations.AddIndex( + model_name="tag", + index=models.Index(fields=["lineage"], name="oel_tagging_lineage_d65f82_idx"), + ), + ], + database_operations=[ + migrations.RunPython(_create_lineage_index, _drop_lineage_index, elidable=False), + ], + ), + ] diff --git a/src/openedx_tagging/models/base.py b/src/openedx_tagging/models/base.py index 668ba73e..870a3df2 100644 --- a/src/openedx_tagging/models/base.py +++ b/src/openedx_tagging/models/base.py @@ -5,30 +5,28 @@ import logging import re -from typing import List +from typing import List, Self from django.core.exceptions import ValidationError from django.db import models -from django.db.models import F, Q, Value -from django.db.models.functions import Concat, Lower +from django.db.models import F, Value +from django.db.models.functions import Concat, Length, Replace, Substr from django.utils.functional import cached_property from django.utils.module_loading import import_string from django.utils.translation import gettext_lazy as _ -from typing_extensions import Self # Until we upgrade to python 3.11 from openedx_django_lib.fields import MultiCollationTextField, case_insensitive_char_field, case_sensitive_char_field from ..data import TagDataQuerySet -from .utils import RESERVED_TAG_CHARS, ConcatNull +from .utils import RESERVED_TAG_CHARS log = logging.getLogger(__name__) - -# Maximum depth allowed for a hierarchical taxonomy's tree of tags. -TAXONOMY_MAX_DEPTH = 3 +# Maximum depth of tags that can be created. Internally, the system has no depth limits, but for reasonable performance +# guarantees we enforce this depth. Note: depth is zero-indexed so "5" means 6 levels of depth are allowed. +TAXONOMY_MAX_DEPTH = 5 # Ancestry of a given tag; the Tag.value fields of a given tag and its parents, starting from the root. -# Will contain 0...TAXONOMY_MAX_DEPTH elements. Lineage = List[str] @@ -74,16 +72,55 @@ class Tag(models.Model): "Used to link an Open edX Tag with a tag in an externally-defined taxonomy." ), ) + depth = models.IntegerField( + default=0, + help_text=_( + "Number of ancestors this tag has. Zero for root tags, one for their children, and so on." + " Set automatically by save(); do not set manually." + ), + ) + lineage = case_insensitive_char_field( + max_length=3006, + default="", + help_text=_( + "Tab-separated ancestor path including this tag: 'Root\\tParent\\t...\\tThisValue\\t'." + " Used for depth-first tree ordering and descendant prefix matching." + " Set automatically by save(); do not set manually." + ), + ) class Meta: indexes = [ models.Index(fields=["taxonomy", "value"]), models.Index(fields=["taxonomy", "external_id"]), + models.Index(fields=["lineage"]), ] unique_together = [ ["taxonomy", "external_id"], ["taxonomy", "value"], ] + constraints = [ + # Enforce that tags with a parent always have a positive `depth`. + # Note: we intentionally only enforce one direction here; enforcing a stricter condition (that when parent + # is NULL, depth must be zero) unfortunately causes deletes to fail with an integrity error, when the delete + # operation pre-sets the child tag's foreign key to NULL before cascading the delete on MySQL. + models.CheckConstraint( + condition=models.Q(parent_id__isnull=True) | models.Q(depth__gt=0), + name="oel_tagging_tag_depth_parent_check", + ), + # Enforce that the lineage ends with "{value}\t" + models.CheckConstraint( + condition=models.Q(lineage__endswith=Concat(F("value"), Value("\t"))), + name="oel_tagging_tag_lineage_ends_with_value", + ), + # Verify that the lineage column contains exactly [depth + 1] TAB (\t) characters: + models.CheckConstraint( + condition=models.Q( + depth=Length(F("lineage")) - Length(Replace(F("lineage"), Value("\t"), Value(""))) - 1 + ), + name="oel_tagging_tag_lineage_tab_count_check", + ), + ] def __repr__(self): """ @@ -111,56 +148,7 @@ def get_lineage(self) -> Lineage: The root Tag.value is first, followed by its child.value, and on down to self.value. """ - lineage: Lineage = [self.value] - next_ancestor = self.get_next_ancestor() - while next_ancestor: - lineage.insert(0, next_ancestor.value) - next_ancestor = next_ancestor.get_next_ancestor() - return lineage - - def get_next_ancestor(self) -> Tag | None: - """ - Fetch the parent of this Tag. - - While doing so, preload several ancestors at the same time, so we can - use fewer database queries than the basic approach of iterating through - parent.parent.parent... - """ - if self.parent_id is None: - return None - if not Tag.parent.is_cached(self): # pylint: disable=no-member - # Parent is not yet loaded. Retrieve our parent, grandparent, and great-grandparent in one query. - # This is not actually changing the parent, just loading it and caching it. - self.parent = Tag.objects.select_related("parent", "parent__parent").get(pk=self.parent_id) - return self.parent - - @cached_property - def depth(self) -> int: - """ - How many ancestors this Tag has. Zero for root tags. - """ - depth = 0 - tag = self - while tag.parent: - depth += 1 - tag = tag.parent - return depth - - @staticmethod - def annotate_depth(qs: models.QuerySet) -> models.QuerySet: - """ - Given a query that loads Tag objects, annotate it with the depth of - each tag. - """ - return qs.annotate(depth=models.Case( - models.When(parent_id=None, then=0), - models.When(parent__parent_id=None, then=1), - models.When(parent__parent__parent_id=None, then=2), - models.When(parent__parent__parent__parent_id=None, then=3), - # If the depth is 4 or more, currently we just "collapse" the depth - # to 4 in order not to add too many joins to this query in general. - default=4, - )) + return self.lineage.rstrip("\t").split("\t") @cached_property def child_count(self) -> int: @@ -177,12 +165,48 @@ def descendant_count(self) -> int: How many descendant tags this tag has in the taxonomy. """ if self.taxonomy and not self.taxonomy.allow_free_text: + # depth__gt correctly excludes self; lineage prefix matches all descendants. return self.taxonomy.tag_set.filter( - Q(parent__parent=self) | - Q(parent__parent__parent=self) - ).count() + self.child_count + depth__gt=self.depth, + lineage__startswith=self.lineage, + ).count() return 0 + def save(self, *args, **kwargs): + """ + Compute and persist depth and lineage before saving, then cascade any changes to descendants. + """ + self.clean() + old_values = ( + Tag.objects.filter(pk=self.pk).values("depth", "lineage").first() + if self.pk else None + ) + if self.parent_id is None: + self.depth = 0 + self.lineage = self.value + "\t" + else: + if Tag.parent.is_cached(self): # pylint: disable=no-member + parent_depth = self.parent.depth + parent_lineage = self.parent.lineage + else: + parent_vals = Tag.objects.values("depth", "lineage").get(pk=self.parent_id) + parent_depth = parent_vals["depth"] + parent_lineage = parent_vals["lineage"] + self.depth = parent_depth + 1 + self.lineage = parent_lineage + self.value + "\t" + super().save(*args, **kwargs) + # Cascade lineage (and depth, if it changed) to all descendants in a single UPDATE. + if old_values is not None and old_values["lineage"] and old_values["lineage"] != self.lineage: + depth_delta = self.depth - old_values["depth"] + update_kwargs: dict = { + "lineage": Concat(Value(self.lineage), Substr(F("lineage"), len(old_values["lineage"]) + 1)), + } + if depth_delta != 0: + update_kwargs["depth"] = F("depth") + depth_delta + self.taxonomy.tag_set.filter( + lineage__startswith=old_values["lineage"] + ).exclude(pk=self.pk).update(**update_kwargs) + def clean(self): """ Validate this tag before saving @@ -384,7 +408,7 @@ def copy(self, taxonomy: Taxonomy) -> Taxonomy: def get_filtered_tags( # pylint: disable=too-many-positional-arguments self, - depth: int | None = TAXONOMY_MAX_DEPTH, + depth: int | None = None, parent_tag_value: str | None = None, search_term: str | None = None, include_counts: bool = False, @@ -398,8 +422,7 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments By default returns all the tags of the given taxonomy Use `depth=1` to return a single level of tags, without any child - tags included. Use `depth=None` or `depth=TAXONOMY_MAX_DEPTH` to return - all descendants of the tags, up to our maximum supported depth. + tags included. Use `depth=None` to return all descendants of the tags. Use `parent_tag_value` to return only the children/descendants of a specific tag. @@ -428,7 +451,7 @@ def get_filtered_tags( # pylint: disable=too-many-positional-arguments return result.exclude(value__in=excluded_values) else: return result - elif depth is None or depth == TAXONOMY_MAX_DEPTH: + elif depth is None: return self._get_filtered_tags_deep( parent_tag_value=parent_tag_value, search_term=search_term, @@ -482,18 +505,20 @@ def _get_filtered_tags_one_level( if parent_tag_value: parent_tag = self.tag_for_value(parent_tag_value) qs: models.QuerySet = self.tag_set.filter(parent_id=parent_tag.pk) - qs = qs.annotate(depth=Value(parent_tag.depth + 1)) # Use parent_tag.value not parent_tag_value because they may differ in case qs = qs.annotate(parent_value=Value(parent_tag.value)) else: - qs = self.tag_set.filter(parent=None).annotate(depth=Value(0)) # type: ignore[no-redef] + qs = self.tag_set.filter(parent=None) qs = qs.annotate(parent_value=Value(None, output_field=models.CharField())) qs = qs.annotate(child_count=models.Count("children", distinct=True)) # type: ignore[no-redef] - qs = qs.annotate(grandchild_count=models.Count("children__children", distinct=True)) - qs = qs.annotate(great_grandchild_count=models.Count("children__children__children")) - qs = qs.annotate( - descendant_count=F("child_count") + F("grandchild_count") + F("great_grandchild_count") - ) # type: ignore[no-redef] + # Count all descendants at any depth using depth + lineage prefix. + # depth__gt correctly excludes self; lineage prefix matches all descendants. + descendants_sq = ( + self.tag_set.filter(depth__gt=models.OuterRef("depth"), lineage__startswith=models.OuterRef("lineage")) + .order_by() + .annotate(count=models.Func(F("id"), function="Count")) + ) + qs = qs.annotate(descendant_count=models.Subquery(descendants_sq.values("count"))) # type: ignore[no-redef] # Filter by search term: if search_term: qs = qs.filter(value__icontains=search_term) @@ -524,24 +549,30 @@ def _get_filtered_tags_deep( Implementation of get_filtered_tags() for closed taxonomies, where we're including tags from multiple levels of the hierarchy. """ - # All tags (possibly below a certain tag?) in the closed taxonomy, up to depth TAXONOMY_MAX_DEPTH + # Note: we ignore a lot of "no-redef" warnings here because we use annotations to pre-load fields like + # `child_count`, and `descendant_count` for all tags in a single query rather than computing them later for each + # Tag, with additional queries. Also, we are converting the result to a values query (that returns a dict), not + # returning actual Tag objects at the end, but mypy doesn't know that. + if parent_tag_value: - main_parent_id = self.tag_for_value(parent_tag_value).pk + # Get a subtree below this tag: + main_parent_tag = self.tag_for_value(parent_tag_value) + initial_qs = self.tag_set.filter( + lineage__startswith=main_parent_tag.lineage, + depth__gt=main_parent_tag.depth, + ) else: - main_parent_id = None - - assert TAXONOMY_MAX_DEPTH == 3 # If we change TAXONOMY_MAX_DEPTH we need to change this query code: - qs: models.QuerySet = self.tag_set.filter( - Q(parent_id=main_parent_id) | - Q(parent__parent_id=main_parent_id) | - Q(parent__parent__parent_id=main_parent_id) - ) + initial_qs = self.tag_set.all() if search_term: # We need to do an additional query to find all the tags that match the search term, then limit the # search to those tags and their ancestors. - matching_tags = qs.filter(value__icontains=search_term).values( - 'id', 'parent_id', 'parent__parent_id', 'parent__parent__parent_id' + matching_tags = initial_qs.filter(value__icontains=search_term).values( + 'id', 'parent_id', 'parent__parent_id', 'parent__parent__parent_id', + # Note: ancestors beyond parent__parent__parent get handled in the loop below, albeit with extra queries + # It's possible to refactor this to support unlimited depth in a single query using lineage, but + # it's too slow. Doing additional queries in the case of high depth is an acceptable trade-off for + # better performance overall and with shallower depths. ) if excluded_values: matching_tags = matching_tags.exclude(value__in=excluded_values) @@ -550,48 +581,53 @@ def _get_filtered_tags_deep( for pk in row.values(): if pk is not None: matching_ids.append(pk) - qs = qs.filter(pk__in=matching_ids) - qs = qs.annotate( - child_count=models.Count("children", filter=Q(children__pk__in=matching_ids), distinct=True), - grandchild_count=models.Count( - "children__children", filter=Q(children__children__pk__in=matching_ids), distinct=True, - ), - great_grandchild_count=models.Count( - "children__children__children", - filter=Q(children__children__children__pk__in=matching_ids), - ), - ) - qs = qs.annotate(descendant_count=F("child_count") + F("grandchild_count") + F("great_grandchild_count")) + next_ancestor_id = row["parent__parent__parent_id"] + while next_ancestor_id: # If there are even deeper ancestors, add them (inefficiently): + next_ancestor_id = Tag.objects.get(pk=next_ancestor_id).parent_id + matching_ids.append(next_ancestor_id) + + initial_qs = initial_qs.filter(pk__in=matching_ids) elif excluded_values: raise NotImplementedError("Using excluded_values without search_term is not currently supported.") # We could implement this in the future but I'd prefer to get rid of the "excluded_values" API altogether. # It remains to be seen if it's useful to do that on the backend, or if we can do it better/simpler on the # frontend. - else: - qs = qs.annotate(child_count=models.Count("children", distinct=True)) - qs = qs.annotate(grandchild_count=models.Count("children__children", distinct=True)) - qs = qs.annotate(great_grandchild_count=models.Count("children__children__children")) - qs = qs.annotate(descendant_count=F("child_count") + F("grandchild_count") + F("great_grandchild_count")) - - # Add the "depth" to each tag: - qs = Tag.annotate_depth(qs) - # Add the "lineage" as a field called "sort_key" to sort them in order correctly: - qs = qs.annotate(sort_key=Lower(Concat( - # For a root tag, we want sort_key="RootValue" and for a depth=1 tag - # we want sort_key="RootValue\tValue". The following does that, since - # ConcatNull(...) returns NULL if any argument is NULL. - ConcatNull(F("parent__parent__parent__value"), Value("\t")), - ConcatNull(F("parent__parent__value"), Value("\t")), - ConcatNull(F("parent__value"), Value("\t")), - F("value"), - Value("\t"), # We also need the '\t' separator character at the end, or MySQL will sort things wrong - output_field=models.CharField(), - ))) + + # Count the direct children, and annotate the result on each row as "child_count". + # The query below produces the same results as: + # qs = initial_qs.annotate(child_count=models.Count("children")) + # However, this correlated subquery avoids a JOIN + GROUP BY, and is far more efficient in practice. + # This also lets us use the same code path whether there's a search_term or not. + child_count_sq = ( + initial_qs + .filter(parent_id=models.OuterRef("pk")) + .order_by() + .annotate(count=models.Func(F("id"), function="Count")) + ) + # Count all descendants at any depth using the lineage prefix trick. + descendants_sq = ( + initial_qs + .filter( + depth__gt=models.OuterRef("depth"), + lineage__startswith=models.OuterRef("lineage"), + ) + .order_by() + .annotate(count=models.Func(F("id"), function="Count")) + ) + qs = initial_qs.annotate( # type: ignore[no-redef] + child_count=models.Subquery(child_count_sq.values("count")), + descendant_count=models.Subquery(descendants_sq.values("count")), + ) + # Add the parent value qs = qs.annotate(parent_value=F("parent__value")) qs = qs.annotate(_id=F("id")) # ID has an underscore to encourage use of 'value' rather than this internal ID - qs = qs.values("value", "child_count", "descendant_count", "depth", "parent_value", "external_id", "_id") - qs = qs.order_by("sort_key") + qs = qs.values( # type: ignore[assignment] + "value", "child_count", "descendant_count", "depth", "parent_value", "external_id", "_id" + ) + # lineage is a case-insensitive column storing "Root\tParent\t...\tThisValue\t", so + # ordering by it gives the tree sort order that we want. + qs = qs.order_by("lineage") if include_counts: # Including the counts is a bit tricky; see the comment above in _get_filtered_tags_one_level() obj_tags = ObjectTag.objects.filter(tag_id=models.OuterRef("pk")).order_by().annotate( @@ -632,6 +668,8 @@ def add_tag( # Get parent tag from taxonomy, raises Tag.DoesNotExist if doesn't # belong to taxonomy parent = self.tag_set.get(value__iexact=parent_tag_value) + if parent.depth >= TAXONOMY_MAX_DEPTH: + raise ValueError(f"Cannot create tags more than {TAXONOMY_MAX_DEPTH + 1} levels deep.") tag = Tag.objects.create( taxonomy=self, value=tag_value, parent=parent, external_id=external_id @@ -715,7 +753,7 @@ def validate_value(self, value: str) -> bool: return value != "" and isinstance(value, str) return self.tag_set.filter(value__iexact=value).exists() - def tag_for_value(self, value: str) -> Tag: + def tag_for_value(self, value: str, select_related: list[str] | None = None) -> Tag: """ Get the Tag object for the given value. Some Taxonomies may auto-create the Tag at this point, e.g. a User @@ -726,7 +764,11 @@ def tag_for_value(self, value: str) -> Tag: self.check_casted() if self.allow_free_text: raise ValueError("tag_for_value() doesn't work for free text taxonomies. They don't use Tag instances.") - return self.tag_set.get(value__iexact=value) + if select_related is not None: + qs = self.tag_set.select_related(*select_related) + else: + qs = self.tag_set.all() + return qs.get(value__iexact=value) def validate_external_id(self, external_id: str) -> bool: """ diff --git a/src/openedx_tagging/models/system_defined.py b/src/openedx_tagging/models/system_defined.py index 370af3c5..e709fcae 100644 --- a/src/openedx_tagging/models/system_defined.py +++ b/src/openedx_tagging/models/system_defined.py @@ -4,6 +4,7 @@ from __future__ import annotations import logging +from typing import override from django.conf import settings from django.contrib.auth import get_user_model @@ -96,7 +97,8 @@ def validate_value(self, value: str): except ObjectDoesNotExist: return False - def tag_for_value(self, value: str): + @override + def tag_for_value(self, value: str, select_related: list[str] | None = None) -> Tag: """ Get the Tag object for the given value. """ @@ -114,6 +116,9 @@ def tag_for_value(self, value: str): # We assume the value may change but the external_id is immutable. # So look up keys using external_id. There may be a key with the same external_id but an out of date value. external_id = str(getattr(instance, self.tag_class_key_field)) + tag_set = self.tag_set.all() + if select_related is not None: + tag_set = tag_set.select_related(*select_related) # type: ignore[assignment] tag, _created = self.tag_set.get_or_create(external_id=external_id, defaults={"value": value}) if tag.value != value: # Update the Tag to reflect the new cached 'value' @@ -207,7 +212,8 @@ def validate_value(self, value: str): return True return False - def tag_for_value(self, value: str): + @override + def tag_for_value(self, value: str, select_related: list[str] | None = None) -> Tag: """ Get the Tag object for the given value. """ diff --git a/src/openedx_tagging/models/utils.py b/src/openedx_tagging/models/utils.py index 85167ded..a2c07ce6 100644 --- a/src/openedx_tagging/models/utils.py +++ b/src/openedx_tagging/models/utils.py @@ -1,82 +1,13 @@ """ Utilities for tagging and taxonomy models """ -from django.db import connection as db_connection -from django.db.models import Aggregate, CharField, TextField -from django.db.models.expressions import Combinable, Func RESERVED_TAG_CHARS = [ '\t', # Used in the database to separate tag levels in the "lineage" field - # e.g. lineage="Earth\tNorth America\tMexico\tMexico City" + # e.g. lineage="Earth\tNorth America\tMexico\tMexico City\t" ' > ', # Used in the search index and Instantsearch frontend to separate tag levels # e.g. tags_level3="Earth > North America > Mexico > Mexico City" ';', # Used in CSV exports to separate multiple tags from the same taxonomy # e.g. languages-v1: en;es;fr ] TAGS_CSV_SEPARATOR = RESERVED_TAG_CHARS[2] - - -class ConcatNull(Func): # pylint: disable=abstract-method - """ - Concatenate two arguments together. Like normal SQL but unlike Django's - "Concat", if either argument is NULL, the result will be NULL. - """ - - function = "CONCAT" - - def as_sqlite(self, compiler, connection, **extra_context): - """ SQLite doesn't have CONCAT() but has a concatenation operator """ - return super().as_sql( - compiler, - connection, - template="%(expressions)s", - arg_joiner=" || ", - **extra_context, - ) - - -class StringAgg(Aggregate, Combinable): - """ - Aggregate function that collects the values of some column across all rows, - and creates a string by concatenating those values, with a specified separator. - - This version supports PostgreSQL (STRING_AGG), MySQL (GROUP_CONCAT), and SQLite. - """ - # Default function is for MySQL (GROUP_CONCAT) - function = 'GROUP_CONCAT' - template = '%(function)s(%(distinct)s%(expressions)s)' - - def __init__(self, expression, distinct=False, delimiter=',', **extra): - self.delimiter = delimiter - # Handle the distinct option and output type - distinct_str = 'DISTINCT ' if distinct else '' - - extra.update({ - 'distinct': distinct_str, - 'output_field': CharField(), - }) - - # Check the database backend (PostgreSQL, MySQL, or SQLite) - if 'postgresql' in db_connection.vendor.lower(): - self.function = 'STRING_AGG' - self.template = "%(function)s(%(distinct)s%(expressions)s::TEXT, '%(delimiter)s')" - extra.update({ - "delimiter": self.delimiter, - "output_field": TextField(), - }) - - # Initialize the parent class with the necessary parameters - super().__init__( - expression, - **extra, - ) - - # Implementing abstract methods from Combinable - def __rand__(self, other): - return self._combine(other, 'AND', False) - - def __ror__(self, other): - return self._combine(other, 'OR', False) - - def __rxor__(self, other): - return self._combine(other, 'XOR', False) diff --git a/tests/openedx_tagging/fixtures/tagging.yaml b/tests/openedx_tagging/fixtures/tagging.yaml index 6608ef8f..83f1fba8 100644 --- a/tests/openedx_tagging/fixtures/tagging.yaml +++ b/tests/openedx_tagging/fixtures/tagging.yaml @@ -26,6 +26,8 @@ parent: null value: Bacteria external_id: null + depth: 0 + lineage: "Bacteria\t" - model: oel_tagging.tag pk: 2 fields: @@ -33,6 +35,8 @@ parent: null value: Archaea external_id: null + depth: 0 + lineage: "Archaea\t" - model: oel_tagging.tag pk: 3 fields: @@ -40,6 +44,8 @@ parent: null value: Eukaryota external_id: null + depth: 0 + lineage: "Eukaryota\t" - model: oel_tagging.tag pk: 4 fields: @@ -47,6 +53,8 @@ parent: 1 value: Eubacteria external_id: null + depth: 1 + lineage: "Bacteria\tEubacteria\t" - model: oel_tagging.tag pk: 5 fields: @@ -54,6 +62,8 @@ parent: 1 value: Archaebacteria external_id: null + depth: 1 + lineage: "Bacteria\tArchaebacteria\t" - model: oel_tagging.tag pk: 6 fields: @@ -61,6 +71,8 @@ parent: 2 value: DPANN external_id: null + depth: 1 + lineage: "Archaea\tDPANN\t" - model: oel_tagging.tag pk: 7 fields: @@ -68,6 +80,8 @@ parent: 2 value: Euryarchaeida external_id: null + depth: 1 + lineage: "Archaea\tEuryarchaeida\t" - model: oel_tagging.tag pk: 8 fields: @@ -75,6 +89,8 @@ parent: 2 value: Proteoarchaeota external_id: null + depth: 1 + lineage: "Archaea\tProteoarchaeota\t" - model: oel_tagging.tag pk: 9 fields: @@ -82,6 +98,8 @@ parent: 3 value: Animalia external_id: null + depth: 1 + lineage: "Eukaryota\tAnimalia\t" - model: oel_tagging.tag pk: 10 fields: @@ -89,6 +107,8 @@ parent: 3 value: Plantae external_id: null + depth: 1 + lineage: "Eukaryota\tPlantae\t" - model: oel_tagging.tag pk: 11 fields: @@ -96,6 +116,8 @@ parent: 3 value: Fungi external_id: null + depth: 1 + lineage: "Eukaryota\tFungi\t" - model: oel_tagging.tag pk: 12 fields: @@ -103,6 +125,8 @@ parent: 3 value: Protista external_id: null + depth: 1 + lineage: "Eukaryota\tProtista\t" - model: oel_tagging.tag pk: 13 fields: @@ -110,6 +134,8 @@ parent: 3 value: Monera external_id: null + depth: 1 + lineage: "Eukaryota\tMonera\t" - model: oel_tagging.tag pk: 14 fields: @@ -117,6 +143,8 @@ parent: 9 value: Arthropoda external_id: null + depth: 2 + lineage: "Eukaryota\tAnimalia\tArthropoda\t" - model: oel_tagging.tag pk: 15 fields: @@ -124,6 +152,8 @@ parent: 9 value: Chordata external_id: null + depth: 2 + lineage: "Eukaryota\tAnimalia\tChordata\t" - model: oel_tagging.tag pk: 16 fields: @@ -131,6 +161,8 @@ parent: 9 value: Gastrotrich external_id: null + depth: 2 + lineage: "Eukaryota\tAnimalia\tGastrotrich\t" - model: oel_tagging.tag pk: 17 fields: @@ -138,6 +170,8 @@ parent: 9 value: Cnidaria external_id: null + depth: 2 + lineage: "Eukaryota\tAnimalia\tCnidaria\t" - model: oel_tagging.tag pk: 18 fields: @@ -145,6 +179,8 @@ parent: 9 value: Ctenophora external_id: null + depth: 2 + lineage: "Eukaryota\tAnimalia\tCtenophora\t" - model: oel_tagging.tag pk: 19 fields: @@ -152,6 +188,8 @@ parent: 9 value: Placozoa external_id: null + depth: 2 + lineage: "Eukaryota\tAnimalia\tPlacozoa\t" - model: oel_tagging.tag pk: 20 fields: @@ -159,6 +197,8 @@ parent: 9 value: Porifera external_id: null + depth: 2 + lineage: "Eukaryota\tAnimalia\tPorifera\t" - model: oel_tagging.tag pk: 21 fields: @@ -166,6 +206,8 @@ parent: 15 value: Mammalia external_id: null + depth: 3 + lineage: "Eukaryota\tAnimalia\tChordata\tMammalia\t" - model: oel_tagging.tag pk: 22 fields: @@ -173,6 +215,8 @@ parent: null value: System Tag 1 external_id: 'tag_1' + depth: 0 + lineage: "System Tag 1\t" - model: oel_tagging.tag pk: 23 fields: @@ -180,6 +224,8 @@ parent: null value: System Tag 2 external_id: 'tag_2' + depth: 0 + lineage: "System Tag 2\t" - model: oel_tagging.tag pk: 24 fields: @@ -187,6 +233,8 @@ parent: null value: System Tag 3 external_id: 'tag_3' + depth: 0 + lineage: "System Tag 3\t" - model: oel_tagging.tag pk: 25 fields: @@ -194,13 +242,17 @@ parent: null value: System Tag 4 external_id: 'tag_4' + depth: 0 + lineage: "System Tag 4\t" - model: oel_tagging.tag pk: 26 - fields: + fields: taxonomy: 5 parent: null value: Tag 1 external_id: tag_1 + depth: 0 + lineage: "Tag 1\t" - model: oel_tagging.tag pk: 27 fields: @@ -208,6 +260,8 @@ parent: 26 value: Tag 2 external_id: tag_2 + depth: 1 + lineage: "Tag 1\tTag 2\t" - model: oel_tagging.tag pk: 28 fields: @@ -215,6 +269,8 @@ parent: null value: Tag 3 external_id: tag_3 + depth: 0 + lineage: "Tag 3\t" - model: oel_tagging.tag pk: 29 fields: @@ -222,6 +278,8 @@ parent: 28 value: Tag 4 external_id: tag_4 + depth: 1 + lineage: "Tag 3\tTag 4\t" - model: oel_tagging.taxonomy pk: 1 fields: diff --git a/tests/openedx_tagging/import_export/test_template.py b/tests/openedx_tagging/import_export/test_template.py index 4a45f54b..9b34947e 100644 --- a/tests/openedx_tagging/import_export/test_template.py +++ b/tests/openedx_tagging/import_export/test_template.py @@ -61,9 +61,7 @@ def test_import_template(self, template_file, parser_format): ' Hi-hat (HI-HAT) (Idiophone) (children: 0)', ' Membranophone (DRUMS) (Percussion instruments) (children: 2 + 1)', ' Cajón (CAJÓN) (Membranophone) (children: 1)', - # This tag is present in the import files, but it will be omitted from get_tags() - # because it sits beyond TAXONOMY_MAX_DEPTH. - # Pyle Stringed Jam Cajón (PYLE) (Cajón) (children: 0) + ' Pyle Stringed Jam Cajón (PYLE) (Cajón) (children: 0)', ' Tabla (TABLA) (Membranophone) (children: 0)', 'String instruments (STRINGS) (None) (children: 2 + 5)', ' Bowed strings (BOW) (String instruments) (children: 2)', diff --git a/tests/openedx_tagging/test_api.py b/tests/openedx_tagging/test_api.py index aaa372b4..236447a1 100644 --- a/tests/openedx_tagging/test_api.py +++ b/tests/openedx_tagging/test_api.py @@ -147,7 +147,8 @@ def test_get_tags(self) -> None: "Eukaryota (children: 5 + 8)", " Animalia (children: 7 + 1)", " Arthropoda (children: 0)", - " Chordata (children: 1)", # The child of this is excluded due to depth limit + " Chordata (children: 1)", + " Mammalia (children: 0)", " Cnidaria (children: 0)", " Ctenophora (children: 0)", " Gastrotrich (children: 0)", @@ -784,11 +785,12 @@ def get_object_tags(): "Bacteria (used: 0, children: 2)", " Archaebacteria (used: 1, children: 0)", " Eubacteria (used: 0, children: 0)", - "Eukaryota (used: 0, children: 4 + 7)", - " Animalia (used: 1, children: 7)", + "Eukaryota (used: 0, children: 4 + 8)", + " Animalia (used: 1, children: 7 + 1)", " Arthropoda (used: 1, children: 0)", - " Chordata (used: 0, children: 0)", # <<< Chordata has a matching child but we only support searching - " Cnidaria (used: 0, children: 0)", # 3 levels deep at once for now. + " Chordata (used: 0, children: 1)", + " Mammalia (used: 0, children: 0)", + " Cnidaria (used: 0, children: 0)", " Ctenophora (used: 0, children: 0)", " Gastrotrich (used: 1, children: 0)", " Placozoa (used: 1, children: 0)", @@ -1070,3 +1072,122 @@ def test_unmark_copied_tags(self) -> None: tags = tagging_api.get_object_tags(obj2) assert tags.filter(is_copied=False).count() == 3 assert tags.filter(is_copied=True).count() == 0 + + def test_deep_taxonomy(self) -> None: + """ + Test that a taxonomy's tags can be nested deeply + """ + taxonomy = tagging_api.create_taxonomy(name="Deep Taxonomy") + + tag_0 = tagging_api.add_tag_to_taxonomy(taxonomy, "Root - depth 0") + assert tag_0.depth == 0 + + tag_1 = tagging_api.add_tag_to_taxonomy(taxonomy, "Child - depth 1", parent_tag_value=tag_0.value) + assert tag_1.depth == 1 + + tag_2 = tagging_api.add_tag_to_taxonomy(taxonomy, "Grandchild - depth 2", parent_tag_value=tag_1.value) + assert tag_2.depth == 2 + + # Up to this point is the level at which all the APIs are guaranteed to work correctly. + + tag_3 = tagging_api.add_tag_to_taxonomy(taxonomy, "Great-Grandchild - depth 3", parent_tag_value=tag_2.value) + assert tag_3.depth == 3 + + tag_4 = tagging_api.add_tag_to_taxonomy( + taxonomy, "Great-Great-Grandchild - depth 4", parent_tag_value=tag_3.value + ) + assert tag_4.depth == 4 + + tag_5 = tagging_api.add_tag_to_taxonomy( + taxonomy, "Great-Great-Great-Grandchild - depth 5", parent_tag_value=tag_4.value + ) + assert tag_5.depth == 5 + + assert pretty_format_tags( + tagging_api.get_tags(taxonomy) + ) == [ + "Root - depth 0 (None) (children: 1 + 4)", + " Child - depth 1 (Root - depth 0) (children: 1 + 3)", + " Grandchild - depth 2 (Child - depth 1) (children: 1 + 2)", + " Great-Grandchild - depth 3 (Grandchild - depth 2) (children: 1 + 1)", + " Great-Great-Grandchild - depth 4 (Great-Grandchild - depth 3) (children: 1)", + " Great-Great-Great-Grandchild - depth 5 (Great-Great-Grandchild - depth 4) (children: 0)", + ] + # And we can load deep levels one level at a time: + assert pretty_format_tags( + tagging_api.get_children_tags(taxonomy, parent_tag_value=tag_3.value) + ) == [ + ' Great-Great-Grandchild - depth 4 (Great-Grandchild - depth 3) (children: 1)', + ] + assert pretty_format_tags( + tagging_api.get_children_tags(taxonomy, parent_tag_value=tag_4.value) + ) == [ + ' Great-Great-Great-Grandchild - depth 5 (Great-Great-Grandchild - depth 4) (children: 0)', + ] + # Or even load a subtree: + deep_result = taxonomy.get_filtered_tags(depth=None, parent_tag_value=tag_2.value) + assert pretty_format_tags(deep_result, parent=False) == [ + ' Great-Grandchild - depth 3 (children: 1 + 1)', + ' Great-Great-Grandchild - depth 4 (children: 1)', + ' Great-Great-Great-Grandchild - depth 5 (children: 0)', + ] + # Let's check that all the fields on the last entry, especially 'depth': + assert deep_result[2]["value"] == "Great-Great-Great-Grandchild - depth 5" + assert deep_result[2]["parent_value"] == "Great-Great-Grandchild - depth 4" + assert deep_result[2]["depth"] == 5 + assert deep_result[2]["child_count"] == 0 + + # Also check that Tag.get_lineage() works correctly for super deep tags: + assert tag_5.get_lineage() == [ + 'Root - depth 0', + 'Child - depth 1', + 'Grandchild - depth 2', + 'Great-Grandchild - depth 3', + 'Great-Great-Grandchild - depth 4', + 'Great-Great-Great-Grandchild - depth 5', + ] + + def test_object_tags_deep(self) -> None: + """ + Test that a deep taxonomy's tags can be used to tag objects + """ + taxonomy = tagging_api.create_taxonomy(name="Deep Taxonomy") + tag_0 = tagging_api.add_tag_to_taxonomy(taxonomy, "Bob - depth 0") + tag_1 = tagging_api.add_tag_to_taxonomy(taxonomy, "Janet - depth 1", parent_tag_value=tag_0.value) + tag_2 = tagging_api.add_tag_to_taxonomy(taxonomy, "Alice - depth 2", parent_tag_value=tag_1.value) + tag_3 = tagging_api.add_tag_to_taxonomy(taxonomy, "Fred - depth 3", parent_tag_value=tag_2.value) + tag_4 = tagging_api.add_tag_to_taxonomy(taxonomy, "Patty - depth 4", parent_tag_value=tag_3.value) + tag_5 = tagging_api.add_tag_to_taxonomy(taxonomy, "Clara - depth 5", parent_tag_value=tag_4.value) + assert tag_5.depth == 5 + + object_id = "some object" + tagging_api.tag_object(object_id, taxonomy, tags=[tag_5.value]) + + assert [ot.tag for ot in tagging_api.get_object_tags(object_id)] == [tag_5] + + # Now test the sort order of many tags (the recursive CTE handles depth > 3 correctly): + tagging_api.tag_object(object_id, taxonomy, tags=[ + tag_0.value, tag_1.value, tag_2.value, tag_3.value, tag_4.value, tag_5.value, + ]) + assert [ot.tag for ot in tagging_api.get_object_tags(object_id)] == [ + tag_0, + tag_1, + tag_2, + tag_3, + tag_4, + tag_5, + ] + + def test_depth_limit(self) -> None: + """ + Test that there is a limit to how deeply we can create tags: + """ + taxonomy = tagging_api.create_taxonomy(name="Deep Taxonomy") + tag_0 = tagging_api.add_tag_to_taxonomy(taxonomy, "Bob - depth 0") + tag_1 = tagging_api.add_tag_to_taxonomy(taxonomy, "Janet - depth 1", parent_tag_value=tag_0.value) + tag_2 = tagging_api.add_tag_to_taxonomy(taxonomy, "Alice - depth 2", parent_tag_value=tag_1.value) + tag_3 = tagging_api.add_tag_to_taxonomy(taxonomy, "Fred - depth 3", parent_tag_value=tag_2.value) + tag_4 = tagging_api.add_tag_to_taxonomy(taxonomy, "Clara - depth 4", parent_tag_value=tag_3.value) + tag_4 = tagging_api.add_tag_to_taxonomy(taxonomy, "Patty - depth 5", parent_tag_value=tag_4.value) + with pytest.raises(ValueError, match="Cannot create tags more than 6 levels deep."): + tagging_api.add_tag_to_taxonomy(taxonomy, "Deep - depth 6", parent_tag_value=tag_4.value) diff --git a/tests/openedx_tagging/test_models.py b/tests/openedx_tagging/test_models.py index 67ae68e2..adc60c2f 100644 --- a/tests/openedx_tagging/test_models.py +++ b/tests/openedx_tagging/test_models.py @@ -1,6 +1,7 @@ """ Test the tagging base models """ + from __future__ import annotations import ddt # type: ignore[import] @@ -417,12 +418,12 @@ def test_depth_1_queries(self) -> None: self.test_get_root() with self.assertNumQueries(1): self.test_get_depth_1_search_term() - # When listing the tags below a specific tag, there is one additional query to load each ancestor tag: + # When listing the tags below a specific tag, there is one additional query to load the parent tag: with self.assertNumQueries(2): self.test_get_child_tags_one_level() with self.assertNumQueries(2): self.test_get_depth_1_child_search_term() - with self.assertNumQueries(3): + with self.assertNumQueries(2): self.test_get_grandchild_tags_one_level() ################## @@ -443,7 +444,8 @@ def test_get_all(self) -> None: "Eukaryota (None) (children: 5 + 8)", " Animalia (Eukaryota) (children: 7 + 1)", " Arthropoda (Animalia) (children: 0)", - " Chordata (Animalia) (children: 1)", # note this has a child but the child is not included + " Chordata (Animalia) (children: 1)", + " Mammalia (Chordata) (children: 0)", " Cnidaria (Animalia) (children: 0)", " Ctenophora (Animalia) (children: 0)", " Gastrotrich (Animalia) (children: 0)", @@ -622,6 +624,12 @@ def test_descendant_counts(self) -> None: "Interests (None) (used: 0, children: 1 + 7)", " Holland Codes (Interests) (used: 0, children: 1 + 6)", " Interests - Holland Codes (Holland Codes) (used: 0, children: 6)", + " Artistic (Interests - Holland Codes) (used: 0, children: 0)", + " Conventional (Interests - Holland Codes) (used: 0, children: 0)", + " Enterprising (Interests - Holland Codes) (used: 0, children: 0)", + " Investigative (Interests - Holland Codes) (used: 0, children: 0)", + " Realistic (Interests - Holland Codes) (used: 0, children: 0)", + " Social (Interests - Holland Codes) (used: 0, children: 0)", ] @@ -905,3 +913,234 @@ def test_is_deleted(self): (self.archaea.value, False), (self.bacteria.value, True), # <--- deleted! But the value is preserved. ] + + +class TestTagLineage(TestCase): + """ + Test the Tag.lineage field, which stores the full tab-separated ancestor + path including the tag itself: "Root\tParent\t...\tThisValue\t". + + The tree used throughout this class: + + Charlie (depth 0) + Alice (depth 1) + Delta (depth 2) + Echo (depth 3) + Foxtrot (depth 4) + Bob (depth 1) + Danielle (depth 0) + """ + + def setUp(self): + taxonomy = api.create_taxonomy("Test TagLineage") + self.charlie = Tag.objects.create(taxonomy=taxonomy, value="Charlie") + self.alice = Tag.objects.create(taxonomy=taxonomy, value="Alice", parent=self.charlie) + self.bob = Tag.objects.create(taxonomy=taxonomy, value="Bob", parent=self.charlie) + self.delta = Tag.objects.create(taxonomy=taxonomy, value="Delta", parent=self.alice) + self.echo = Tag.objects.create(taxonomy=taxonomy, value="Echo", parent=self.delta) + self.foxtrot = Tag.objects.create(taxonomy=taxonomy, value="Foxtrot", parent=self.echo) + self.danielle = Tag.objects.create(taxonomy=taxonomy, value="Danielle") + + def test_root_tag(self): + assert self.charlie.lineage == "Charlie\t" + + def test_depth_1(self): + assert self.alice.lineage == "Charlie\tAlice\t" + + def test_depth_2(self): + assert self.delta.lineage == "Charlie\tAlice\tDelta\t" + + def test_depth_3(self): + assert self.echo.lineage == "Charlie\tAlice\tDelta\tEcho\t" + + def test_depth_4(self): + assert self.foxtrot.lineage == "Charlie\tAlice\tDelta\tEcho\tFoxtrot\t" + + def test_second_root(self): + assert self.danielle.lineage == "Danielle\t" + + def test_tree_sort_order(self): + """ + Tags ordered by lineage come out in depth-first tree order: + each parent immediately before its subtree, siblings alphabetically. + Because lineage uses a case-insensitive collation, the sort matches + what the old LOWER(sort_key) CTE produced. + """ + tags = Tag.objects.filter( + pk__in=[ + self.charlie.pk, + self.alice.pk, + self.bob.pk, + self.delta.pk, + self.echo.pk, + self.foxtrot.pk, + self.danielle.pk, + ] + ).order_by("lineage") + # fmt: off + assert [t.value for t in tags] == [ + "Charlie", # Charlie\t + "Alice", # Charlie\tAlice\t + "Delta", # Charlie\tAlice\tDelta\t + "Echo", # Charlie\tAlice\tDelta\tEcho\t + "Foxtrot", # Charlie\tAlice\tDelta\tEcho\tFoxtrot\t + "Bob", # Charlie\tBob\t (after Alice's entire subtree) + "Danielle", # Danielle\t + ] + # fmt: on + + def _refresh_all(self): + """Refresh all tags from the database.""" + self.charlie.refresh_from_db() + self.alice.refresh_from_db() + self.bob.refresh_from_db() + self.delta.refresh_from_db() + self.echo.refresh_from_db() + self.foxtrot.refresh_from_db() + self.danielle.refresh_from_db() + + def test_reparent_to_lower_depth(self): + """ + Moving a tag to a deeper location updates its depth and lineage, + and cascades to all its descendants. + + Before: Charlie -> Alice -> Delta -> Echo -> Foxtrot + After: Charlie -> Bob -> Alice -> Delta -> Echo -> Foxtrot + (Alice moves from depth 1 to depth 2, all descendants shift +1) + """ + self.alice.parent = self.bob + self.alice.save() + self._refresh_all() + + assert self.alice.depth == 2 + assert self.alice.lineage == "Charlie\tBob\tAlice\t" + + assert self.delta.depth == 3 + assert self.delta.lineage == "Charlie\tBob\tAlice\tDelta\t" + + assert self.echo.depth == 4 + assert self.echo.lineage == "Charlie\tBob\tAlice\tDelta\tEcho\t" + + assert self.foxtrot.depth == 5 + assert self.foxtrot.lineage == "Charlie\tBob\tAlice\tDelta\tEcho\tFoxtrot\t" + + # Bob's depth should be unchanged + assert self.bob.depth == 1 + assert self.bob.lineage == "Charlie\tBob\t" + + def test_reparent_to_higher_depth(self): + """ + Moving a tag to a shallower location updates its depth and lineage, + and cascades to all its descendants. + + Before: Charlie -> Alice -> Delta -> Echo -> Foxtrot + After: Charlie -> Delta -> Echo -> Foxtrot + (Delta moves from depth 2 to depth 1, all descendants shift -1) + """ + self.delta.parent = self.charlie + self.delta.save() + self._refresh_all() + + assert self.delta.depth == 1 + assert self.delta.lineage == "Charlie\tDelta\t" + + assert self.echo.depth == 2 + assert self.echo.lineage == "Charlie\tDelta\tEcho\t" + + assert self.foxtrot.depth == 3 + assert self.foxtrot.lineage == "Charlie\tDelta\tEcho\tFoxtrot\t" + + # Alice should be unaffected + assert self.alice.depth == 1 + assert self.alice.lineage == "Charlie\tAlice\t" + + def test_reparent_to_equal_depth(self): + """ + Moving a tag (Delta) to a different parent at the same depth updates its + lineage but leaves depths unchanged. + + Before: Charlie -> Alice -> Delta -> Echo -> Foxtrot + Charlie -> Bob + + After: Charlie -> Alice + Charlie -> Bob -> Delta -> Echo -> Foxtrot + + (Delta moves from Alice to Bob, same depth 2) + """ + self.delta.parent = self.bob + self.delta.save() + self._refresh_all() + + assert self.delta.depth == 2 + assert self.delta.lineage == "Charlie\tBob\tDelta\t" + + assert self.echo.depth == 3 + assert self.echo.lineage == "Charlie\tBob\tDelta\tEcho\t" + + assert self.foxtrot.depth == 4 + assert self.foxtrot.lineage == "Charlie\tBob\tDelta\tEcho\tFoxtrot\t" + + # Alice should be unaffected + assert self.alice.depth == 1 + assert self.alice.lineage == "Charlie\tAlice\t" + + def test_reparent_to_different_root(self): + """ + Moving a tag (Alice) to a parent under a completely different root + updates the full lineage prefix for the tag and all its descendants. + + Before: Charlie -> Alice -> Delta -> Echo -> Foxtrot + Danielle + After: Danielle -> Alice -> Delta -> Echo -> Foxtrot + Charlie + """ + self.alice.parent = self.danielle + self.alice.save() + self._refresh_all() + + assert self.alice.depth == 1 + assert self.alice.lineage == "Danielle\tAlice\t" + + assert self.delta.depth == 2 + assert self.delta.lineage == "Danielle\tAlice\tDelta\t" + + assert self.echo.depth == 3 + assert self.echo.lineage == "Danielle\tAlice\tDelta\tEcho\t" + + assert self.foxtrot.depth == 4 + assert self.foxtrot.lineage == "Danielle\tAlice\tDelta\tEcho\tFoxtrot\t" + + # Charlie is now childless but unchanged + assert self.charlie.depth == 0 + assert self.charlie.lineage == "Charlie\t" + + def test_reparent_to_root(self): + """ + Moving a child tag (Alice) to the root (no parent) updates depth to 0 + and removes all ancestor prefixes from its lineage and those of its + descendants. + + Before: Charlie -> Alice -> Delta -> Echo -> Foxtrot + After: Alice -> Delta -> Echo -> Foxtrot (Alice becomes a root tag) + """ + self.alice.parent = None + self.alice.save() + self._refresh_all() + + assert self.alice.depth == 0 + assert self.alice.lineage == "Alice\t" + + assert self.delta.depth == 1 + assert self.delta.lineage == "Alice\tDelta\t" + + assert self.echo.depth == 2 + assert self.echo.lineage == "Alice\tDelta\tEcho\t" + + assert self.foxtrot.depth == 3 + assert self.foxtrot.lineage == "Alice\tDelta\tEcho\tFoxtrot\t" + + # Charlie and Bob are unaffected + assert self.charlie.depth == 0 + assert self.charlie.lineage == "Charlie\t" + assert self.bob.depth == 1 + assert self.bob.lineage == "Charlie\tBob\t" diff --git a/tests/openedx_tagging/test_views.py b/tests/openedx_tagging/test_views.py index 67b975e0..63de0715 100644 --- a/tests/openedx_tagging/test_views.py +++ b/tests/openedx_tagging/test_views.py @@ -1459,6 +1459,7 @@ def test_small_taxonomy(self): " Animalia (Eukaryota) (children: 7 + 1)", " Arthropoda (Animalia) (children: 0)", " Chordata (Animalia) (children: 1)", + " Mammalia (Chordata) (children: 0)", " Cnidaria (Animalia) (children: 0)", " Ctenophora (Animalia) (children: 0)", " Gastrotrich (Animalia) (children: 0)",