From 8d486128ec9be06b265b6c8124f7b8b839540662 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 15:58:26 +0100 Subject: [PATCH 01/10] Prebuild and cache inline image XML Avoid calling python-docx per-image by generating a CT_Inline-based XML template once and using str.format() to fill sentinels (keeping compatibility with installed python-docx). Add caching of generated image XML per (part, descriptor, width, height) to skip repeated I/O, SHA1 work and header parsing. Use package.get_or_add_image_part and relate_to with RT.IMAGE, compute scaled_dimensions, assign shape_id from docx_ids_index, and xml-escape filenames. Also add a _image_cache dict on DocxTemplate and adjust hyperlink handling to use the local part variable. --- docxtpl/inline_image.py | 98 ++++++++++++++++++++++++++++++++++++++--- docxtpl/template.py | 1 + 2 files changed, 93 insertions(+), 6 deletions(-) diff --git a/docxtpl/inline_image.py b/docxtpl/inline_image.py index f860749..781976b 100644 --- a/docxtpl/inline_image.py +++ b/docxtpl/inline_image.py @@ -4,8 +4,62 @@ @author: Eric Lapouyade """ +from xml.sax.saxutils import escape as xml_escape + +from docx.opc.constants import RELATIONSHIP_TYPE as RT from docx.oxml import OxmlElement, parse_xml from docx.oxml.ns import qn +from docx.oxml.shape import CT_Inline +from docx.shared import Emu + + +def _build_inline_image_xml_template(): + """Generate the XML format string by calling python-docx with sentinel values. + + This ensures the template always matches the installed python-docx version's + XML structure, even after upgrades. We call CT_Inline.new_pic_inline() once + with recognizable sentinel values, serialize to XML, then replace the + sentinels with Python format placeholders. 
+ """ + import uuid + + # Use GUIDs for string sentinels - guaranteed no collision with XML content + _RID_SENTINEL = str(uuid.uuid4()) + _FILENAME_SENTINEL = str(uuid.uuid4()) + + # For numeric sentinels, use unique integers derived from UUIDs. + # shape_id is xsd:unsignedInt (max 4,294,967,295 / 32-bit). + # cx/cy are EMU values typed as xsd:long (64-bit). + # All use 9-digit range [100000000, 999999999] to stay within 32-bit + # and avoid any accidental collisions with each other. + _SHAPE_ID = uuid.uuid4().int % (9 * 10**8) + 10**8 + _CX_INT = uuid.uuid4().int % (9 * 10**8) + 10**8 + _CY_INT = uuid.uuid4().int % (9 * 10**8) + 10**8 + + inline = CT_Inline.new_pic_inline( + _SHAPE_ID, + _RID_SENTINEL, + _FILENAME_SENTINEL, + Emu(_CX_INT), + Emu(_CY_INT), + ) + xml = inline.xml + + # Replace sentinel values with format string placeholders + xml = xml.replace(str(_SHAPE_ID), "{shape_id}") + xml = xml.replace(_RID_SENTINEL, "{rId}") + xml = xml.replace(_FILENAME_SENTINEL, "{filename}") + xml = xml.replace(str(_CX_INT), "{cx}") + xml = xml.replace(str(_CY_INT), "{cy}") + + return xml + + +# Pre-built XML template for inline images, derived from the installed +# python-docx version. Using str.format() on this template avoids calling +# CT_Inline.new_pic_inline() per image (which does 2x parse_xml() + +# element manipulation + .xml serialization each time). +_INLINE_IMAGE_XML = _build_inline_image_xml_template() class InlineImage(object): @@ -50,16 +104,48 @@ def _add_hyperlink(self, run, url, part): return run def _insert_image(self): - pic = self.tpl.current_rendering_part.new_pic_inline( - self.image_descriptor, - self.width, - self.height, - ).xml + part = self.tpl.current_rendering_part + image_descriptor = self.image_descriptor + + # Cache generated XML per (part, descriptor, width, height) to avoid + # repeated file I/O, SHA1 computation, and header parsing. 
+ cache = self.tpl._image_cache + cache_key = (id(part), image_descriptor, self.width, self.height) + + if cache_key in cache: + pic = cache[cache_key] + else: + # Get or add the image part (handles deduplication via SHA1 internally) + package = part._package + image_part = package.get_or_add_image_part(image_descriptor) + rId = part.relate_to(image_part, RT.IMAGE) + image = image_part.image + cx, cy = image.scaled_dimensions(self.width, self.height) + + # Assign shape_id from a simple counter. python-docx's + # new_pic_inline() would call its next_id property which does an + # XPath("//@id") over the entire XML tree on every call - but we + # bypass that entirely by generating the XML ourselves. + # fix_docpr_ids() renumbers all IDs after rendering anyway. + self.tpl.docx_ids_index += 1 + shape_id = self.tpl.docx_ids_index + + # Generate XML directly as a string using a pre-built template + # rather than calling CT_Inline.new_pic_inline() per image. + pic = _INLINE_IMAGE_XML.format( + cx=int(cx), + cy=int(cy), + shape_id=shape_id, + filename=xml_escape(image.filename), + rId=rId, + ) + cache[cache_key] = pic + if self.anchor: run = parse_xml(pic) if run.xpath(".//a:blip"): hyperlink = self._add_hyperlink( - run, self.anchor, self.tpl.current_rendering_part + run, self.anchor, part ) pic = hyperlink.xml diff --git a/docxtpl/template.py b/docxtpl/template.py index abcff49..a0d325b 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -171,6 +171,7 @@ def render_init(self): self.pic_map = {} self.current_rendering_part = None self.docx_ids_index = 1000 + self._image_cache = {} self.is_saved = False def __getattr__(self, name): From ddf1687f9dbf592199e4aff6cc52e541455d3616 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 16:25:48 +0100 Subject: [PATCH 02/10] Optimize image part deduplication Add an O(1) SHA1 index for image parts and a fast _get_or_add_image_part helper on DocxTemplate to avoid 
python-docx's O(n) linear scan and repeated SHA1 recomputation. Initialize the index in the constructor (_init_image_parts_index), seed it from existing image parts, and maintain a sequential partname counter to prevent partname collisions. Update InlineImage to call tpl._get_or_add_image_part (which returns (image_part, image)) instead of package.get_or_add_image_part, and use the returned Image object. This improves performance and reduces redundant SHA1 work when inserting/looking up images. --- docxtpl/inline_image.py | 7 +++--- docxtpl/template.py | 55 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/docxtpl/inline_image.py b/docxtpl/inline_image.py index 781976b..7d353ad 100644 --- a/docxtpl/inline_image.py +++ b/docxtpl/inline_image.py @@ -115,11 +115,10 @@ def _insert_image(self): if cache_key in cache: pic = cache[cache_key] else: - # Get or add the image part (handles deduplication via SHA1 internally) - package = part._package - image_part = package.get_or_add_image_part(image_descriptor) + # Get or add the image part with O(1) SHA1 deduplication, + # avoiding the O(n) linear scan and SHA1 recomputation per lookup. + image_part, image = self.tpl._get_or_add_image_part(image_descriptor) rId = part.relate_to(image_part, RT.IMAGE) - image = image_part.image cx, cy = image.scaled_dimensions(self.width, self.height) # Assign shape_id from a simple counter. python-docx's diff --git a/docxtpl/template.py b/docxtpl/template.py index a0d325b..69eb2f7 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -173,6 +173,61 @@ def render_init(self): self.docx_ids_index = 1000 self._image_cache = {} self.is_saved = False + self._init_image_parts_index() + + def _init_image_parts_index(self): + """Build an O(1) SHA1 index of existing image parts in the package. 
+ + This enables fast deduplication in _get_or_add_image_part(), avoiding + the O(n) linear scan and repeated SHA1 recomputation that occurs in + the default python-docx image-part lookup. + """ + package = self.docx._part._package + image_parts = package.image_parts + + # Seed the index from existing image parts in the template. + # ImagePart.sha1 recomputes on each access, but this is a one-time + # cost for the (typically few) images already in the template. + self._image_sha1_index = {} + for ip in image_parts: + self._image_sha1_index[ip.sha1] = ip + + # Start the partname counter after all existing image parts to avoid + # collisions with partnames already in the package. + self._image_part_counter = len(image_parts._image_parts) + + def _get_or_add_image_part(self, image_descriptor): + """Return (image_part, image) for the given image_descriptor. + + Performs the same function as python-docx's + Package.get_or_add_image_part() but with O(1) SHA1 deduplication + (instead of O(n) linear scan with repeated SHA1 recomputation) and + sequential partname assignment (instead of O(n²) gap-search). 
+ """ + from docx.image.image import Image + from docx.opc.packuri import PackURI + from docx.parts.image import ImagePart + + image = Image.from_file(image_descriptor) + sha1 = image.sha1 # @lazyproperty — computed once per Image object + + image_part = self._image_sha1_index.get(sha1) + if image_part is not None: + return image_part, image + + # New unique image — create part with sequential partname + self._image_part_counter += 1 + partname = PackURI( + "/word/media/image%d.%s" % (self._image_part_counter, image.ext) + ) + image_part = ImagePart.from_image(image, partname) + + # Add to the package collection and the SHA1 index + package = self.docx._part._package + package.image_parts.append(image_part) + self._image_sha1_index[sha1] = image_part + + return image_part, image def __getattr__(self, name): return getattr(self.docx, name) From 4a96bc4b5b505812a15736538b98e9cf140299e0 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 16:51:28 +0100 Subject: [PATCH 03/10] Use descriptor cache for image deduplication Replace the SHA1-based image-part index with a descriptor-keyed cache (_image_descriptor_index) to deduplicate images by file-path (O(1)) and avoid expensive SHA1 hashing. For string path descriptors the cache is used to return existing (image_part, image) tuples; non-string descriptors (e.g. file-like objects) fall back to always creating a new part. Keeps sequential partname assignment and appends new ImagePart to the package; caches the result for string descriptors. This improves performance when adding many images (e.g. large photos) by eliminating repeated SHA1 computation. 
--- docxtpl/inline_image.py | 6 +++--- docxtpl/template.py | 47 +++++++++++++++++++++-------------------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/docxtpl/inline_image.py b/docxtpl/inline_image.py index 7d353ad..3c69168 100644 --- a/docxtpl/inline_image.py +++ b/docxtpl/inline_image.py @@ -108,15 +108,15 @@ def _insert_image(self): image_descriptor = self.image_descriptor # Cache generated XML per (part, descriptor, width, height) to avoid - # repeated file I/O, SHA1 computation, and header parsing. + # repeated file I/O, image hashing, and header parsing. cache = self.tpl._image_cache cache_key = (id(part), image_descriptor, self.width, self.height) if cache_key in cache: pic = cache[cache_key] else: - # Get or add the image part with O(1) SHA1 deduplication, - # avoiding the O(n) linear scan and SHA1 recomputation per lookup. + # Get or add the image part with O(1) descriptor-based dedup, + # avoiding the O(n) linear scan in python-docx's default path. image_part, image = self.tpl._get_or_add_image_part(image_descriptor) rId = part.relate_to(image_part, RT.IMAGE) cx, cy = image.scaled_dimensions(self.width, self.height) diff --git a/docxtpl/template.py b/docxtpl/template.py index 69eb2f7..9e9faaf 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -176,21 +176,17 @@ def render_init(self): self._init_image_parts_index() def _init_image_parts_index(self): - """Build an O(1) SHA1 index of existing image parts in the package. + """Initialize image-part tracking for fast insertion. - This enables fast deduplication in _get_or_add_image_part(), avoiding - the O(n) linear scan and repeated SHA1 recomputation that occurs in - the default python-docx image-part lookup. + Uses a descriptor-keyed cache (file path string) for O(1) dedup of + images added during rendering, avoiding expensive content hashing. """ package = self.docx._part._package image_parts = package.image_parts - # Seed the index from existing image parts in the template. 
- # ImagePart.sha1 recomputes on each access, but this is a one-time - # cost for the (typically few) images already in the template. - self._image_sha1_index = {} - for ip in image_parts: - self._image_sha1_index[ip.sha1] = ip + # Descriptor-keyed cache: maps image_descriptor -> (image_part, image) + # This is the primary dedup mechanism and avoids expensive content hashing. + self._image_descriptor_index = {} # Start the partname counter after all existing image parts to avoid # collisions with partnames already in the package. @@ -199,35 +195,40 @@ def _init_image_parts_index(self): def _get_or_add_image_part(self, image_descriptor): """Return (image_part, image) for the given image_descriptor. - Performs the same function as python-docx's - Package.get_or_add_image_part() but with O(1) SHA1 deduplication - (instead of O(n) linear scan with repeated SHA1 recomputation) and - sequential partname assignment (instead of O(n²) gap-search). + Uses the descriptor itself (file path) as the dedup key, avoiding + expensive content hashing. Falls back to always creating a new part + for non-hashable descriptors (file-like objects). """ from docx.image.image import Image from docx.opc.packuri import PackURI from docx.parts.image import ImagePart - image = Image.from_file(image_descriptor) - sha1 = image.sha1 # @lazyproperty — computed once per Image object + # For string paths, use the path as a cheap dedup key. 
+ cache_key = image_descriptor if isinstance(image_descriptor, str) else None + + if cache_key is not None: + cached = self._image_descriptor_index.get(cache_key) + if cached is not None: + return cached - image_part = self._image_sha1_index.get(sha1) - if image_part is not None: - return image_part, image + image = Image.from_file(image_descriptor) - # New unique image — create part with sequential partname + # Create image part with sequential partname self._image_part_counter += 1 partname = PackURI( "/word/media/image%d.%s" % (self._image_part_counter, image.ext) ) image_part = ImagePart.from_image(image, partname) - # Add to the package collection and the SHA1 index + # Add to the package collection package = self.docx._part._package package.image_parts.append(image_part) - self._image_sha1_index[sha1] = image_part - return image_part, image + result = (image_part, image) + if cache_key is not None: + self._image_descriptor_index[cache_key] = result + + return result def __getattr__(self, name): return getattr(self.docx, name) From 98d8aba7b63b2f20be808d1017701eaf2665a324 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 17:39:45 +0100 Subject: [PATCH 04/10] Cache image metadata instead of XML Cache only the expensive image metadata (rId, dimensions, filename) per (part, descriptor, width, height) instead of the full inline XML. A fresh shape_id is now assigned for every insertion so drawing IDs remain unique (important for headers/footers/footnotes which aren't renumbered by fix_docpr_ids()). This preserves performance benefits (avoids repeated image part lookup, hashing and header parsing) while preventing duplicate drawing IDs; cx/cy are stored as ints and filename is xml-escaped when cached. 
--- docxtpl/inline_image.py | 45 +++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/docxtpl/inline_image.py b/docxtpl/inline_image.py index 3c69168..10441f0 100644 --- a/docxtpl/inline_image.py +++ b/docxtpl/inline_image.py @@ -107,38 +107,39 @@ def _insert_image(self): part = self.tpl.current_rendering_part image_descriptor = self.image_descriptor - # Cache generated XML per (part, descriptor, width, height) to avoid - # repeated file I/O, image hashing, and header parsing. + # Cache the expensive parts (image part lookup, rId, dimensions) per + # (part, descriptor, width, height). The XML string itself is NOT + # cached because each insertion needs a unique shape_id - header/footer + # and footnote parts are not renumbered by fix_docpr_ids(). cache = self.tpl._image_cache cache_key = (id(part), image_descriptor, self.width, self.height) if cache_key in cache: - pic = cache[cache_key] + rId, cx, cy, filename = cache[cache_key] else: # Get or add the image part with O(1) descriptor-based dedup, # avoiding the O(n) linear scan in python-docx's default path. image_part, image = self.tpl._get_or_add_image_part(image_descriptor) rId = part.relate_to(image_part, RT.IMAGE) cx, cy = image.scaled_dimensions(self.width, self.height) - - # Assign shape_id from a simple counter. python-docx's - # new_pic_inline() would call its next_id property which does an - # XPath("//@id") over the entire XML tree on every call - but we - # bypass that entirely by generating the XML ourselves. - # fix_docpr_ids() renumbers all IDs after rendering anyway. - self.tpl.docx_ids_index += 1 - shape_id = self.tpl.docx_ids_index - - # Generate XML directly as a string using a pre-built template - # rather than calling CT_Inline.new_pic_inline() per image. 
- pic = _INLINE_IMAGE_XML.format( - cx=int(cx), - cy=int(cy), - shape_id=shape_id, - filename=xml_escape(image.filename), - rId=rId, - ) - cache[cache_key] = pic + filename = xml_escape(image.filename) + cache[cache_key] = (rId, int(cx), int(cy), filename) + + # Always assign a fresh shape_id per insertion so that drawing IDs + # are unique in every part (including headers/footers/footnotes + # which are not renumbered by fix_docpr_ids()). + self.tpl.docx_ids_index += 1 + shape_id = self.tpl.docx_ids_index + + # Generate XML directly as a string using a pre-built template + # rather than calling CT_Inline.new_pic_inline() per image. + pic = _INLINE_IMAGE_XML.format( + cx=int(cx), + cy=int(cy), + shape_id=shape_id, + filename=filename, + rId=rId, + ) if self.anchor: run = parse_xml(pic) From e4886535593541d6ee86443d28334dee12dd11a4 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 17:46:16 +0100 Subject: [PATCH 05/10] Handle non-hashable descriptors; escape quotes Use id() for non-hashable image descriptors (e.g. file-like objects) when building the image cache key to avoid TypeError on dict lookup. Also escape double quotes in image filenames for XML attribute usage by passing a mapping to xml_escape so quotes become &quot;. Cache semantics and per-insertion shape_id assignment are otherwise unchanged. --- docxtpl/inline_image.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/docxtpl/inline_image.py b/docxtpl/inline_image.py index 10441f0..3a207be 100644 --- a/docxtpl/inline_image.py +++ b/docxtpl/inline_image.py @@ -112,7 +112,10 @@ def _insert_image(self): # cached because each insertion needs a unique shape_id - header/footer # and footnote parts are not renumbered by fix_docpr_ids(). cache = self.tpl._image_cache - cache_key = (id(part), image_descriptor, self.width, self.height) + # Use id() for non-hashable descriptors (file-like objects) to avoid + # TypeError on dict lookup. 
+ desc_key = image_descriptor if isinstance(image_descriptor, str) else id(image_descriptor) + cache_key = (id(part), desc_key, self.width, self.height) if cache_key in cache: rId, cx, cy, filename = cache[cache_key] @@ -122,7 +125,8 @@ def _insert_image(self): image_part, image = self.tpl._get_or_add_image_part(image_descriptor) rId = part.relate_to(image_part, RT.IMAGE) cx, cy = image.scaled_dimensions(self.width, self.height) - filename = xml_escape(image.filename) + # Escape for use inside XML attribute (quotes must be escaped) + filename = xml_escape(image.filename, {'"': "&quot;"}) cache[cache_key] = (rId, int(cx), int(cy), filename) # Always assign a fresh shape_id per insertion so that drawing IDs From 7c52c563f74e6ed7ed631213d6b0d126cffeecd3 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 17:49:15 +0100 Subject: [PATCH 06/10] Scan image partnames to derive counter Avoid using len() of image parts to pick the next image partname index, which could collide when numbering is non-contiguous. Instead scan existing image partnames (using partname.baseURI when available, otherwise str(partname)), extract numeric suffixes with a regex (/image(\d+)\.), track the maximum index, and set the image part counter to that max. This ensures new image partnames won't reuse an already-present index. --- docxtpl/template.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/docxtpl/template.py b/docxtpl/template.py index 9e9faaf..c63a2d3 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -188,9 +188,19 @@ def _init_image_parts_index(self): # This is the primary dedup mechanism and avoids expensive content hashing. self._image_descriptor_index = {} - # Start the partname counter after all existing image parts to avoid - # collisions with partnames already in the package. 
- self._image_part_counter = len(image_parts._image_parts) + # Derive the next partname index by scanning existing partnames once. + # Using len() alone would collide with non-contiguous numbering + # (e.g. image1.png + image3.png → len=2 → next would be image3.ext). + max_index = 0 + for ip in image_parts: + # Partnames follow /word/media/imageN.ext pattern + name = ip.partname.baseURI if hasattr(ip.partname, 'baseURI') else str(ip.partname) + m = re.search(r'/image(\d+)\.', name) + if m: + idx = int(m.group(1)) + if idx > max_index: + max_index = idx + self._image_part_counter = max_index def _get_or_add_image_part(self, image_descriptor): """Return (image_part, image) for the given image_descriptor. From 7581a333ec77046ab7e87bffeac0e78defd9b82d Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 17:55:58 +0100 Subject: [PATCH 07/10] Always use str(partname) for image parts Replace conditional use of partname.baseURI with a direct str(partname) conversion when iterating image parts. This makes the code rely on a consistent string representation for part names (used by the /imageN.ext regex) and avoids depending on the presence of a baseURI attribute across different part implementations. 
--- docxtpl/template.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docxtpl/template.py b/docxtpl/template.py index c63a2d3..078d172 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -194,7 +194,7 @@ def _init_image_parts_index(self): max_index = 0 for ip in image_parts: # Partnames follow /word/media/imageN.ext pattern - name = ip.partname.baseURI if hasattr(ip.partname, 'baseURI') else str(ip.partname) + name = str(ip.partname) m = re.search(r'/image(\d+)\.', name) if m: idx = int(m.group(1)) From 82fd69c73314c005654a84998cd802964d0c1f8d Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 17:59:06 +0100 Subject: [PATCH 08/10] Initialize docx_ids_index from existing docPr ids Replace the hardcoded docx_ids_index initialization with a routine that scans all package parts (body, headers, footers, footnotes) for wp:docPr elements and sets the counter above the maximum found id (minimum 1000). This prevents id collisions when inserting new drawings into parts that were not renumbered by fix_docpr_ids. The new method is called during initialization and safely skips non-XML or unreadable parts. --- docxtpl/template.py | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/docxtpl/template.py b/docxtpl/template.py index 078d172..cf339df 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -170,10 +170,48 @@ def render_init(self): self.init_docx() self.pic_map = {} self.current_rendering_part = None - self.docx_ids_index = 1000 self._image_cache = {} self.is_saved = False self._init_image_parts_index() + self._init_docx_ids_index() + + def _init_docx_ids_index(self): + """Set docx_ids_index above the maximum existing wp:docPr id. + + fix_docpr_ids() only renumbers the body tree, so IDs in headers, + footers, and footnotes retain their original values. 
Starting the + counter above the global maximum prevents collisions when inserting + new drawings into any part. + """ + import docx.oxml.ns as _ns + wp_ns = _ns.nsmap['wp'] + tag = "{%s}docPr" % wp_ns + max_id = 0 + + # Scan all parts (body + headers + footers + footnotes) + for part in self.docx._part._package.parts: + if not hasattr(part, 'blob') or part.blob is None: + continue + # Only scan XML parts that could contain drawings + ct = getattr(part, 'content_type', '') + if not ct.startswith('application/vnd.openxmlformats-officedocument'): + continue + try: + tree = etree.fromstring(part.blob) + except Exception: + continue + for elt in tree.iter(tag): + id_val = elt.get('id') + if id_val is not None: + try: + val = int(id_val) + if val > max_id: + max_id = val + except ValueError: + pass + + # Start above the highest existing ID (minimum 1000 for safety) + self.docx_ids_index = max(max_id, 1000) def _init_image_parts_index(self): """Initialize image-part tracking for fast insertion. From ef56632b1938690db98ee9b5cf2c2fe7a7eb34e4 Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 18:05:16 +0100 Subject: [PATCH 09/10] Normalize None image filename before escaping Treat image.filename == None (e.g., BytesIO/file-like descriptors) as an empty string before calling xml_escape so XML attribute generation matches python-docx behavior. Added a clarifying comment and ensure the escaped filename is stored in the cache to avoid None-related issues when rendering. 
--- docxtpl/inline_image.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docxtpl/inline_image.py b/docxtpl/inline_image.py index 3a207be..684b5ce 100644 --- a/docxtpl/inline_image.py +++ b/docxtpl/inline_image.py @@ -125,8 +125,10 @@ def _insert_image(self): image_part, image = self.tpl._get_or_add_image_part(image_descriptor) rId = part.relate_to(image_part, RT.IMAGE) cx, cy = image.scaled_dimensions(self.width, self.height) - # Escape for use inside XML attribute (quotes must be escaped) - filename = xml_escape(image.filename, {'"': "&quot;"}) + # Escape for use inside XML attribute (quotes must be escaped). + # image.filename is None for file-like descriptors (BytesIO); + # normalize to empty string to match python-docx's behavior. + filename = xml_escape(image.filename or "", {'"': "&quot;"}) cache[cache_key] = (rId, int(cx), int(cy), filename) # Always assign a fresh shape_id per insertion so that drawing IDs From f316ca8a4b944ce83ea96cdb6990559a71d23f8d Mon Sep 17 00:00:00 2001 From: Jack Byrne <46843566+JackByrne@users.noreply.github.com> Date: Mon, 18 May 2026 18:15:13 +0100 Subject: [PATCH 10/10] Skip caching unhashable image descriptors Only build and use a cache key when the image_descriptor is hashable. Previously id() was used for non-hashable descriptors (e.g. file-like objects), which could risk aliasing after GC and lead to incorrect deduplication. Now the code attempts to construct a cache key with the descriptor and falls back to skipping caching for unhashable descriptors; cache entries are only read/written when a valid cache_key exists. Filename normalization and per-insertion shape_id behavior are unchanged. 
--- docxtpl/inline_image.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/docxtpl/inline_image.py b/docxtpl/inline_image.py index 684b5ce..da35bbd 100644 --- a/docxtpl/inline_image.py +++ b/docxtpl/inline_image.py @@ -112,12 +112,16 @@ def _insert_image(self): # cached because each insertion needs a unique shape_id - header/footer # and footnote parts are not renumbered by fix_docpr_ids(). cache = self.tpl._image_cache - # Use id() for non-hashable descriptors (file-like objects) to avoid - # TypeError on dict lookup. - desc_key = image_descriptor if isinstance(image_descriptor, str) else id(image_descriptor) - cache_key = (id(part), desc_key, self.width, self.height) - - if cache_key in cache: + # For hashable descriptors (strings, paths), cache by value. + # For unhashable descriptors (file-like objects), skip caching + # entirely — using id() would risk aliasing after GC. + try: + cache_key = (id(part), image_descriptor, self.width, self.height) + hash(cache_key) is not None # trigger TypeError if unhashable + except TypeError: + cache_key = None + + if cache_key is not None and cache_key in cache: rId, cx, cy, filename = cache[cache_key] else: # Get or add the image part with O(1) descriptor-based dedup, @@ -129,7 +133,8 @@ def _insert_image(self): # image.filename is None for file-like descriptors (BytesIO); # normalize to empty string to match python-docx's behavior. filename = xml_escape(image.filename or "", {'"': "&quot;"}) - cache[cache_key] = (rId, int(cx), int(cy), filename) + if cache_key is not None: + cache[cache_key] = (rId, int(cx), int(cy), filename) # Always assign a fresh shape_id per insertion so that drawing IDs # are unique in every part (including headers/footers/footnotes