diff --git a/docxtpl/inline_image.py b/docxtpl/inline_image.py
index f860749..da35bbd 100644
--- a/docxtpl/inline_image.py
+++ b/docxtpl/inline_image.py
@@ -4,8 +4,66 @@
 
 @author: Eric Lapouyade
 """
+from xml.sax.saxutils import escape as xml_escape
+
+from docx.opc.constants import RELATIONSHIP_TYPE as RT
 from docx.oxml import OxmlElement, parse_xml
 from docx.oxml.ns import qn
+from docx.oxml.shape import CT_Inline
+from docx.shared import Emu
+
+
+def _build_inline_image_xml_template():
+    """Generate the XML format string by calling python-docx with sentinel values.
+
+    This ensures the template always matches the installed python-docx version's
+    XML structure, even after upgrades. We call CT_Inline.new_pic_inline() once
+    with recognizable sentinel values, serialize to XML, then replace the
+    sentinels with Python format placeholders.
+    """
+    # Fixed, mutually distinct sentinels keep the generated template
+    # deterministic. Random values could (rarely) collide with each other,
+    # or a numeric sentinel could occur inside a random GUID, silently
+    # corrupting a placeholder.
+    _RID_SENTINEL = "DOCXTPL_RID_SENTINEL"
+    _FILENAME_SENTINEL = "DOCXTPL_FILENAME_SENTINEL"
+
+    # shape_id is xsd:unsignedInt (max 4,294,967,295 / 32-bit).
+    # cx/cy are EMU values typed as xsd:long (64-bit).
+    # All are distinct 9-digit values so they stay within 32-bit and none
+    # is a substring of another.
+    _SHAPE_ID = 918273645
+    _CX_INT = 827364519
+    _CY_INT = 736451928
+
+    inline = CT_Inline.new_pic_inline(
+        _SHAPE_ID,
+        _RID_SENTINEL,
+        _FILENAME_SENTINEL,
+        Emu(_CX_INT),
+        Emu(_CY_INT),
+    )
+    xml = inline.xml
+
+    # Escape literal braces first so str.format() only ever sees our own
+    # placeholders, whatever XML the installed python-docx emits.
+    xml = xml.replace("{", "{{").replace("}", "}}")
+
+    # Replace sentinel values with format string placeholders
+    xml = xml.replace(str(_SHAPE_ID), "{shape_id}")
+    xml = xml.replace(_RID_SENTINEL, "{rId}")
+    xml = xml.replace(_FILENAME_SENTINEL, "{filename}")
+    xml = xml.replace(str(_CX_INT), "{cx}")
+    xml = xml.replace(str(_CY_INT), "{cy}")
+
+    return xml
+
+
+# Pre-built XML template for inline images, derived from the installed
+# python-docx version. Using str.format() on this template avoids calling
+# CT_Inline.new_pic_inline() per image (which does 2x parse_xml() +
+# element manipulation + .xml serialization each time).
+_INLINE_IMAGE_XML = _build_inline_image_xml_template()
 
 
 class InlineImage(object):
@@ -50,16 +108,59 @@ def _add_hyperlink(self, run, url, part):
         return run
 
     def _insert_image(self):
-        pic = self.tpl.current_rendering_part.new_pic_inline(
-            self.image_descriptor,
-            self.width,
-            self.height,
-        ).xml
+        part = self.tpl.current_rendering_part
+        image_descriptor = self.image_descriptor
+
+        # Cache the expensive parts (image part lookup, rId, dimensions) per
+        # (part, descriptor, width, height). The XML string itself is NOT
+        # cached because each insertion needs a unique shape_id - header/footer
+        # and footnote parts are not renumbered by fix_docpr_ids().
+        cache = self.tpl._image_cache
+        # For hashable descriptors (strings, paths), cache by value.
+        # For unhashable descriptors (file-like objects), skip caching
+        # entirely — using id() would risk aliasing after GC.
+        try:
+            cache_key = (id(part), image_descriptor, self.width, self.height)
+            hash(cache_key)  # raises TypeError if any component is unhashable
+        except TypeError:
+            cache_key = None
+
+        if cache_key is not None and cache_key in cache:
+            rId, cx, cy, filename = cache[cache_key]
+        else:
+            # Get or add the image part with O(1) descriptor-based dedup,
+            # avoiding the O(n) linear scan in python-docx's default path.
+            image_part, image = self.tpl._get_or_add_image_part(image_descriptor)
+            rId = part.relate_to(image_part, RT.IMAGE)
+            cx, cy = image.scaled_dimensions(self.width, self.height)
+            # Escape for use inside XML attribute (quotes must be escaped).
+            # image.filename is None for file-like descriptors (BytesIO);
+            # normalize to empty string to match python-docx's behavior.
+            filename = xml_escape(image.filename or "", {'"': "&quot;"})
+            if cache_key is not None:
+                cache[cache_key] = (rId, int(cx), int(cy), filename)
+
+        # Always assign a fresh shape_id per insertion so that drawing IDs
+        # are unique in every part (including headers/footers/footnotes
+        # which are not renumbered by fix_docpr_ids()).
+        self.tpl.docx_ids_index += 1
+        shape_id = self.tpl.docx_ids_index
+
+        # Generate XML directly as a string using a pre-built template
+        # rather than calling CT_Inline.new_pic_inline() per image.
+        pic = _INLINE_IMAGE_XML.format(
+            cx=int(cx),
+            cy=int(cy),
+            shape_id=shape_id,
+            filename=filename,
+            rId=rId,
+        )
+
         if self.anchor:
             run = parse_xml(pic)
             if run.xpath(".//a:blip"):
                 hyperlink = self._add_hyperlink(
-                    run, self.anchor, self.tpl.current_rendering_part
+                    run, self.anchor, part
                 )
                 pic = hyperlink.xml
 
diff --git a/docxtpl/template.py b/docxtpl/template.py
index abcff49..cf339df 100644
--- a/docxtpl/template.py
+++ b/docxtpl/template.py
@@ -170,8 +170,113 @@ def render_init(self):
         self.init_docx()
         self.pic_map = {}
         self.current_rendering_part = None
-        self.docx_ids_index = 1000
+        self._image_cache = {}
         self.is_saved = False
+        self._init_image_parts_index()
+        self._init_docx_ids_index()
+
+    def _init_docx_ids_index(self):
+        """Set docx_ids_index above the maximum existing wp:docPr id.
+
+        fix_docpr_ids() only renumbers the body tree, so IDs in headers,
+        footers, and footnotes retain their original values. Starting the
+        counter above the global maximum prevents collisions when inserting
+        new drawings into any part.
+        """
+        import docx.oxml.ns as _ns
+        wp_ns = _ns.nsmap['wp']
+        tag = "{%s}docPr" % wp_ns
+        max_id = 0
+
+        # Scan all parts (body + headers + footers + footnotes)
+        for part in self.docx._part._package.parts:
+            if not hasattr(part, 'blob') or part.blob is None:
+                continue
+            # Only scan XML parts that could contain drawings
+            ct = getattr(part, 'content_type', '')
+            if not ct.startswith('application/vnd.openxmlformats-officedocument'):
+                continue
+            try:
+                tree = etree.fromstring(part.blob)
+            except Exception:
+                continue
+            for elt in tree.iter(tag):
+                id_val = elt.get('id')
+                if id_val is not None:
+                    try:
+                        val = int(id_val)
+                        if val > max_id:
+                            max_id = val
+                    except ValueError:
+                        pass
+
+        # Start above the highest existing ID (minimum 1000 for safety)
+        self.docx_ids_index = max(max_id, 1000)
+
+    def _init_image_parts_index(self):
+        """Initialize image-part tracking for fast insertion.
+
+        Uses a descriptor-keyed cache (file path string) for O(1) dedup of
+        images added during rendering, avoiding expensive content hashing.
+        """
+        package = self.docx._part._package
+        image_parts = package.image_parts
+
+        # Descriptor-keyed cache: maps image_descriptor -> (image_part, image)
+        # This is the primary dedup mechanism and avoids expensive content hashing.
+        self._image_descriptor_index = {}
+
+        # Derive the next partname index by scanning existing partnames once.
+        # Using len() alone would collide with non-contiguous numbering
+        # (e.g. image1.png + image3.png → len=2 → next would be image3.ext).
+        max_index = 0
+        for ip in image_parts:
+            # Partnames follow /word/media/imageN.ext pattern
+            name = str(ip.partname)
+            m = re.search(r'/image(\d+)\.', name)
+            if m:
+                idx = int(m.group(1))
+                if idx > max_index:
+                    max_index = idx
+        self._image_part_counter = max_index
+
+    def _get_or_add_image_part(self, image_descriptor):
+        """Return (image_part, image) for the given image_descriptor.
+
+        Uses the descriptor itself (file path) as the dedup key, avoiding
+        expensive content hashing. Falls back to always creating a new part
+        for non-hashable descriptors (file-like objects).
+        """
+        from docx.image.image import Image
+        from docx.opc.packuri import PackURI
+        from docx.parts.image import ImagePart
+
+        # For string paths, use the path as a cheap dedup key.
+        cache_key = image_descriptor if isinstance(image_descriptor, str) else None
+
+        if cache_key is not None:
+            cached = self._image_descriptor_index.get(cache_key)
+            if cached is not None:
+                return cached
+
+        image = Image.from_file(image_descriptor)
+
+        # Create image part with sequential partname
+        self._image_part_counter += 1
+        partname = PackURI(
+            "/word/media/image%d.%s" % (self._image_part_counter, image.ext)
+        )
+        image_part = ImagePart.from_image(image, partname)
+
+        # Add to the package collection
+        package = self.docx._part._package
+        package.image_parts.append(image_part)
+
+        result = (image_part, image)
+        if cache_key is not None:
+            self._image_descriptor_index[cache_key] = result
+
+        return result
 
     def __getattr__(self, name):
         return getattr(self.docx, name)