Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 39 additions & 37 deletions src/openedx_tagging/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@

from .data import TagDataQuerySet
from .models import ObjectTag, Tag, Taxonomy
from .models.utils import ConcatNull, StringAgg

# Export this as part of the API
TagDoesNotExist = Tag.DoesNotExist

# Maximum number of tags allowed on any one object
OBJECT_MAX_TAGS = 100


def create_taxonomy( # pylint: disable=too-many-positional-arguments
name: str,
Expand Down Expand Up @@ -198,15 +200,14 @@ def get_object_tags(
base_qs
# Preload related objects, including data for the "get_lineage" method on ObjectTag/Tag:
.select_related("taxonomy", "tag", "tag__parent", "tag__parent__parent")
# Sort the tags within each taxonomy in "tree order". See Taxonomy._get_filtered_tags_deep for details on this:
.annotate(sort_key=Lower(Concat(
ConcatNull(F("tag__parent__parent__parent__value"), Value("\t")),
ConcatNull(F("tag__parent__parent__value"), Value("\t")),
ConcatNull(F("tag__parent__value"), Value("\t")),
Coalesce(F("tag__value"), F("_value")),
Value("\t"),
output_field=models.CharField(),
)))
# Sort the tags within each taxonomy in "tree order". See Taxonomy._get_filtered_tags_deep for details on this.
# tag__lineage is a case-insensitive column storing the full ancestor path, e.g. "Root\tParent\tThis\t".
# Free-text and deleted tags (tag_id IS NULL) fall back to their cached _value.
.annotate(sort_key=Coalesce(
Lower(F("tag__lineage")),
Lower(Concat(F("_value"), Value("\t"))),
output_field=models.TextField(),
))
.annotate(taxonomy_name=Coalesce(F("taxonomy__name"), F("_export_id")))
# Sort first by taxonomy name, then by tag value in tree order:
.order_by("taxonomy_name", "sort_key")
Expand All @@ -222,6 +223,9 @@ def get_object_tag_counts(object_id_pattern: str, count_implicit=False) -> dict[

Deleted tags and disabled taxonomies are excluded from the counts, even if
ObjectTag data about them is present.

count_implicit: if True, also count all ancestor tags (implicit
tags) of the explicit tags that are associated with the object.
"""
# Note: in the future we may add an option to exclude system taxonomies from the count.
qs: Any = ObjectTag.objects
Expand All @@ -236,32 +240,30 @@ def get_object_tag_counts(object_id_pattern: str, count_implicit=False) -> dict[
qs = qs.exclude(taxonomy__enabled=False) # The whole taxonomy is disabled
qs = qs.exclude(tag_id=None, taxonomy__allow_free_text=False) # The taxonomy exists but the tag is deleted
if count_implicit:
# Counting the implicit tags is tricky, because if two "grandchild" tags have the same implicit parent tag, we
# need to count that parent tag only once. To do that, we collect all the ancestor tag IDs into an aggregate
# string, and then count the unique values using python
qs = qs.values("object_id").annotate(
num_tags=models.Count("id"),
tag_ids_str_1=StringAgg("tag_id"),
tag_ids_str_2=StringAgg("tag__parent_id"),
tag_ids_str_3=StringAgg("tag__parent__parent_id"),
tag_ids_str_4=StringAgg("tag__parent__parent__parent_id"),
).order_by("object_id")
result = {}
# Use tag__lineage to count implicit (ancestor) tags at any depth.
# Each tag's lineage encodes its full ancestry path, e.g. "root\tchild\tgrandchild\t".
# Every prefix of that path (at each \t boundary) uniquely identifies one tag in the chain,
# so collecting prefixes into a set naturally deduplicates shared ancestors across multiple
# tags on the same object.
qs = qs.annotate(sort_key=F("tag__lineage")).values("object_id", "sort_key")
result: dict = {}
for row in qs:
# ObjectTags for free text taxonomies will be included in "num_tags" count, but not "tag_ids_str_1" since
# they have no tag ID. We can compute how many free text tags each object has now:
if row["tag_ids_str_1"]:
num_free_text_tags = row["num_tags"] - len(row["tag_ids_str_1"].split(","))
object_id = row["object_id"]
if object_id not in result:
result[object_id] = {"free_text": 0, "paths": set()}
sort_key = row["sort_key"]
if sort_key is None:
# Free-text tag: no Tag record, so no sort_key
result[object_id]["free_text"] += 1
else:
num_free_text_tags = row["num_tags"]
# Then we count the total number of *unique* Tags for this object, both implicit and explicit:
other_tag_ids = set()
for field in ("tag_ids_str_1", "tag_ids_str_2", "tag_ids_str_3", "tag_ids_str_4"):
if row[field] is not None:
for tag_id in row[field].split(","):
other_tag_ids.add(int(tag_id))
result[row["object_id"]] = num_free_text_tags + len(other_tag_ids)
return result
# Add the sort_key prefix for each ancestor level
parts = sort_key.rstrip("\t").split("\t")
for i in range(1, len(parts) + 1):
result[object_id]["paths"].add("\t".join(parts[:i]))
return {
object_id: data["free_text"] + len(data["paths"])
for object_id, data in result.items()
}
else:
qs = qs.values("object_id").annotate(num_tags=models.Count("id")).order_by("object_id")
return {row["object_id"]: row["num_tags"] for row in qs}
Expand All @@ -283,17 +285,17 @@ def _check_new_tag_count(
taxonomy_export_id: str | None = None,
) -> None:
"""
Checks if the new count of tags for the object is equal or less than 100
Checks if the new count of tags for the object is equal or less than OBJECT_MAX_TAGS
"""
# Exclude to avoid counting the tags that are going to be updated
if taxonomy:
current_count = ObjectTag.objects.filter(object_id=object_id).exclude(taxonomy_id=taxonomy.id).count()
else:
current_count = ObjectTag.objects.filter(object_id=object_id).exclude(_export_id=taxonomy_export_id).count()

if current_count + new_tag_count > 100:
if current_count + new_tag_count > OBJECT_MAX_TAGS:
raise ValueError(
_("Cannot add more than 100 tags to ({object_id}).").format(object_id=object_id)
_("Cannot add more than {limit} tags to ({object_id}).").format(object_id=object_id, limit=OBJECT_MAX_TAGS)
)


Expand Down
163 changes: 163 additions & 0 deletions src/openedx_tagging/migrations/0020_tag_depth_and_lineage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""
(1) Add a concrete 'depth' column to the oel_tagging_tag table.

The depth column stores:
- 0 for root tags (parent IS NULL)
- parent.depth + 1 for all other tags

A CHECK constraint enforces this invariant at the database level:
parent_id IS NULL OR depth > 0

(2) Add a concrete 'lineage' column to the oel_tagging_tag table.

The lineage column stores the full tab-separated ancestor path including the
tag itself:

"RootValue\\tParentValue\\t...\\tThisValue\\t"

with original casing and a trailing tab delimiter. Because the column uses a
case-insensitive collation, ORDER BY lineage gives the depth-first tree order
that we want when querying the taxonomy tree.

The trailing tab makes prefix matching unambiguous: every descendant of tag T
has a lineage that starts with T.lineage (since T.lineage ends with '\\t' and
no tag value can contain '\\t').
"""

from django.db import migrations, models
from django.db.models.functions import Concat, Length, Replace

import openedx_django_lib.fields


def populate_depth_and_lineage(apps, _schema_editor):
    """
    Backfill the new `depth` and `lineage` columns on every existing Tag.

    Root tags (no parent) are written first, with depth 0 and a lineage made
    of their own value followed by a tab. The tree is then processed one level
    at a time: each pass selects the tags whose parent currently has the
    previous depth and writes `depth = level` and
    `lineage = parent.lineage + value + tab`.

    Each row is written with its own UPDATE query, filtered by primary key.
    """
    Tag = apps.get_model("oel_tagging", "Tag")
    separator = "\t"

    # Root tags: depth 0, lineage is just "value<TAB>".
    root_tags = Tag.objects.filter(parent__isnull=True).only("id", "value")
    for root in root_tags:
        Tag.objects.filter(pk=root.pk).update(depth=0, lineage=root.value + separator)

    # Walk down the tree one level at a time. Depth should be at most 3 or 4, so the
    # cap of 19 passes is only a generous safety bound.
    # NOTE(review): `depth` is added with default=0 right before this runs, so the first
    # pass appears to match every non-root tag (all parents still read as depth 0). Deeper
    # tags then get provisional values that later passes overwrite once their parents have
    # been corrected. The final values look right, but confirm this is intended.
    for parent_depth in range(19):
        level = parent_depth + 1
        # Materialize the level before updating, so the rows (and the parent lineage
        # loaded with them) are a snapshot taken before this pass writes anything.
        tags_at_level = list(
            Tag.objects.filter(parent__depth=parent_depth)
            .select_related("parent")
            .only("id", "value", "parent__lineage")
        )
        if not tags_at_level:
            # No tag has a parent at the previous depth, so there is nothing deeper to fill in.
            return
        for child in tags_at_level:
            Tag.objects.filter(pk=child.pk).update(
                depth=level,
                lineage=child.parent.lineage + child.value + separator,
            )


def reverse_populate_depth_and_lineage(_apps, _schema_editor):
    """
    Reverse step for the depth/lineage backfill: intentionally a no-op.

    Both fields are dropped on reverse, so no cleanup is needed.
    """
    return None


def _create_lineage_index(_apps, schema_editor):
    """
    Create the `oel_tagging_lineage_d65f82_idx` index on `oel_tagging_tag.lineage`.

    MySQL's InnoDB limits index key length to 3072 bytes. With utf8mb4 (up to
    4 bytes per character) a full-column index on a VARCHAR(3006) would need up
    to 12,024 bytes, far over that limit. On MySQL the index therefore covers
    only a 768-character prefix (768 * 4 = 3072 bytes, exactly at the limit).
    Every other database vendor gets a regular full-column index.
    """
    is_mysql = schema_editor.connection.vendor == "mysql"
    statement = (
        "CREATE INDEX oel_tagging_lineage_d65f82_idx ON oel_tagging_tag (lineage(768))"
        if is_mysql
        else "CREATE INDEX oel_tagging_lineage_d65f82_idx ON oel_tagging_tag (lineage)"
    )
    schema_editor.execute(statement)


def _drop_lineage_index(_apps, schema_editor):
    """
    Drop the `oel_tagging_lineage_d65f82_idx` index (reverse of the create step).

    On MySQL the DROP INDEX statement names the table with an `ON` clause;
    for every other vendor the index is dropped by name alone.
    """
    is_mysql = schema_editor.connection.vendor == "mysql"
    statement = (
        "DROP INDEX oel_tagging_lineage_d65f82_idx ON oel_tagging_tag"
        if is_mysql
        else "DROP INDEX oel_tagging_lineage_d65f82_idx"
    )
    schema_editor.execute(statement)


class Migration(migrations.Migration):
    """
    Add the `depth` and `lineage` columns to Tag, backfill them for existing
    rows, then add the CHECK constraints and the index on `lineage`.
    """

    # Even though this migration no longer creates a view, we keep atomic=False
    # as a safety measure since this migration touched DDL on MySQL in its prior form.
    atomic = False

    dependencies = [
        ("oel_tagging", "0019_language_taxonomy_class"),
    ]

    # The order of these operations matters: the columns must exist before the
    # backfill runs, and the backfill must finish before the constraints are added.
    operations = [
        # 1. Add the depth column to oel_tagging_tag with a safe default of 0.
        migrations.AddField(
            model_name="tag",
            name="depth",
            field=models.IntegerField(
                default=0,
                help_text="Number of ancestors this tag has. Zero for root tags, one for their children, and so on. Set automatically by save(); do not set manually.",
            ),
        ),
        # 2. Add the lineage column with an empty default (populated below).
        migrations.AddField(
            model_name="tag",
            name="lineage",
            field=openedx_django_lib.fields.MultiCollationCharField(
                db_collations={"mysql": "utf8mb4_unicode_ci", "sqlite": "NOCASE"},
                default="",
                help_text="Tab-separated ancestor path including this tag: 'Root\\tParent\\t...\\tThisValue\\t'. Used for depth-first tree ordering and descendant prefix matching. Set automatically by save(); do not set manually.",
                max_length=3006,
            ),
        ),
        # 3. Populate depth and lineage for all pre-existing tags.
        migrations.RunPython(populate_depth_and_lineage, reverse_populate_depth_and_lineage, elidable=False),
        # 4. Add CHECK constraints, once we've populated the values.
        # 4a. Any tag that has a parent must have depth > 0.
        migrations.AddConstraint(
            model_name="tag",
            constraint=models.CheckConstraint(
                condition=(models.Q(parent_id__isnull=True) | models.Q(depth__gt=0)),
                name="oel_tagging_tag_depth_parent_check",
            ),
        ),
        # 4b. The lineage must end with this tag's own value followed by a tab.
        migrations.AddConstraint(
            model_name="tag",
            constraint=models.CheckConstraint(
                condition=models.Q(lineage__endswith=Concat(models.F("value"), models.Value("\t"))),
                name="oel_tagging_tag_lineage_ends_with_value",
            ),
        ),
        # 4c. depth must equal the number of tab characters in lineage, minus one. The tab
        #     count is computed as len(lineage) - len(lineage with all tabs removed).
        migrations.AddConstraint(
            model_name="tag",
            constraint=models.CheckConstraint(
                condition=models.Q(
                    depth=(
                        Length(models.F("lineage"))
                        - Length(Replace(models.F("lineage"), models.Value("\t"), models.Value("")))
                        - 1
                    )
                ),
                name="oel_tagging_tag_lineage_tab_count_check",
            ),
        ),
        # 5. Add index on lineage after data is populated, so the build scans real values.
        #    MySQL's InnoDB limits index keys to 3072 bytes; with utf8mb4 (4 bytes/char) that
        #    caps a full-column index at 768 chars — far shorter than max_length=3006. We use
        #    SeparateDatabaseAndState so Django's migration state records the index normally
        #    (avoiding spurious makemigrations noise) while the actual SQL uses a 768-char
        #    prefix on MySQL and a regular full-column index everywhere else.
        migrations.SeparateDatabaseAndState(
            state_operations=[
                migrations.AddIndex(
                    model_name="tag",
                    index=models.Index(fields=["lineage"], name="oel_tagging_lineage_d65f82_idx"),
                ),
            ],
            database_operations=[
                migrations.RunPython(_create_lineage_index, _drop_lineage_index, elidable=False),
            ],
        ),
    ]
Loading