Skip to content
Merged
109 changes: 103 additions & 6 deletions docxtpl/inline_image.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,62 @@

@author: Eric Lapouyade
"""
from xml.sax.saxutils import escape as xml_escape

from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx.oxml import OxmlElement, parse_xml
from docx.oxml.ns import qn
from docx.oxml.shape import CT_Inline
from docx.shared import Emu


def _build_inline_image_xml_template():
    """Build the ``str.format()`` template used to emit inline-image XML.

    ``CT_Inline.new_pic_inline()`` is called once with recognizable sentinel
    values, the result is serialized, and every sentinel is then swapped for
    a named format placeholder. Deriving the template from python-docx itself
    keeps it in step with the XML structure of the installed version.

    Returns:
        The serialized XML as a string containing the placeholders
        ``{shape_id}``, ``{rId}``, ``{filename}``, ``{cx}`` and ``{cy}``.
        Any literal brace present in the serialized XML is doubled so the
        result can be passed to ``str.format()`` safely.
    """
    import uuid

    # Random UUID strings serve as the textual sentinels.
    rid_sentinel = str(uuid.uuid4())
    filename_sentinel = str(uuid.uuid4())

    # Numeric sentinels are random 9-digit integers, which keeps shape_id
    # inside the 32-bit unsigned range. Each one is drawn from its own third
    # of [10**8, 10**9) so two sentinels can never be equal; equal values
    # would make the placeholders indistinguishable after replacement.
    band = 3 * 10**8
    shape_id_sentinel = 10**8 + uuid.uuid4().int % band
    cx_sentinel = 10**8 + band + uuid.uuid4().int % band
    cy_sentinel = 10**8 + 2 * band + uuid.uuid4().int % band

    inline = CT_Inline.new_pic_inline(
        shape_id_sentinel,
        rid_sentinel,
        filename_sentinel,
        Emu(cx_sentinel),
        Emu(cy_sentinel),
    )

    # Double any literal braces BEFORE inserting placeholders; otherwise a
    # brace coming from python-docx would be parsed as a format field.
    xml = inline.xml.replace("{", "{{").replace("}", "}}")

    # Replace the UUID sentinels first: a UUID can contain a run of nine
    # decimal digits, so substituting a numeric sentinel beforehand could
    # corrupt a string sentinel that happens to embed the same digits.
    substitutions = (
        (rid_sentinel, "{rId}"),
        (filename_sentinel, "{filename}"),
        (str(shape_id_sentinel), "{shape_id}"),
        (str(cx_sentinel), "{cx}"),
        (str(cy_sentinel), "{cy}"),
    )
    for sentinel, placeholder in substitutions:
        xml = xml.replace(sentinel, placeholder)

    return xml


# Pre-built XML template for inline images, derived once at import time from
# the installed python-docx version. Using str.format() on this template
# avoids calling CT_Inline.new_pic_inline() per image (which does 2x
# parse_xml() + element manipulation + .xml serialization each time).
# Placeholders to supply: {shape_id}, {rId}, {filename}, {cx}, {cy}.
_INLINE_IMAGE_XML = _build_inline_image_xml_template()


class InlineImage(object):
Expand Down Expand Up @@ -50,16 +104,59 @@ def _add_hyperlink(self, run, url, part):
return run

def _insert_image(self):
pic = self.tpl.current_rendering_part.new_pic_inline(
self.image_descriptor,
self.width,
self.height,
).xml
part = self.tpl.current_rendering_part
image_descriptor = self.image_descriptor

# Cache the expensive parts (image part lookup, rId, dimensions) per
# (part, descriptor, width, height). The XML string itself is NOT
# cached because each insertion needs a unique shape_id - header/footer
# and footnote parts are not renumbered by fix_docpr_ids().
cache = self.tpl._image_cache
# For hashable descriptors (strings, paths), cache by value.
# For unhashable descriptors (file-like objects), skip caching
# entirely — using id() would risk aliasing after GC.
try:
cache_key = (id(part), image_descriptor, self.width, self.height)
hash(cache_key) is not None # trigger TypeError if unhashable
except TypeError:
cache_key = None

if cache_key is not None and cache_key in cache:
rId, cx, cy, filename = cache[cache_key]
else:
# Get or add the image part with O(1) descriptor-based dedup,
# avoiding the O(n) linear scan in python-docx's default path.
image_part, image = self.tpl._get_or_add_image_part(image_descriptor)
rId = part.relate_to(image_part, RT.IMAGE)
cx, cy = image.scaled_dimensions(self.width, self.height)
# Escape for use inside XML attribute (quotes must be escaped).
# image.filename is None for file-like descriptors (BytesIO);
# normalize to empty string to match python-docx's behavior.
filename = xml_escape(image.filename or "", {'"': """})
if cache_key is not None:
cache[cache_key] = (rId, int(cx), int(cy), filename)

# Always assign a fresh shape_id per insertion so that drawing IDs
# are unique in every part (including headers/footers/footnotes
# which are not renumbered by fix_docpr_ids()).
self.tpl.docx_ids_index += 1
shape_id = self.tpl.docx_ids_index

# Generate XML directly as a string using a pre-built template
# rather than calling CT_Inline.new_pic_inline() per image.
pic = _INLINE_IMAGE_XML.format(
cx=int(cx),
cy=int(cy),
shape_id=shape_id,
filename=filename,
rId=rId,
)

if self.anchor:
run = parse_xml(pic)
if run.xpath(".//a:blip"):
hyperlink = self._add_hyperlink(
run, self.anchor, self.tpl.current_rendering_part
run, self.anchor, part
)
pic = hyperlink.xml

Expand Down
107 changes: 106 additions & 1 deletion docxtpl/template.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,113 @@ def render_init(self):
self.init_docx()
self.pic_map = {}
self.current_rendering_part = None
self.docx_ids_index = 1000
self._image_cache = {}
self.is_saved = False
self._init_image_parts_index()
self._init_docx_ids_index()

def _init_docx_ids_index(self):
    """Seed ``docx_ids_index`` from the largest ``wp:docPr`` id in the package.

    Every package part whose content type starts with
    ``application/vnd.openxmlformats-officedocument`` is parsed and scanned
    for ``wp:docPr`` elements. Parts that have no blob, or whose blob cannot
    be parsed as XML, are skipped, as are ``id`` attributes that are not
    integers. The counter ends up at the highest id found, but never
    below 1000.

    fix_docpr_ids() only renumbers the body tree, so ids in headers, footers
    and footnotes keep their original values; starting above the global
    maximum prevents collisions when new drawings are inserted into any part.
    """
    import docx.oxml.ns as _ns

    docpr_tag = "{%s}docPr" % _ns.nsmap['wp']
    highest = 0

    for pkg_part in self.docx._part._package.parts:
        payload = getattr(pkg_part, 'blob', None)
        if payload is None:
            continue

        # Restrict the scan to Office Open XML parts.
        content_type = getattr(pkg_part, 'content_type', '')
        if not content_type.startswith(
            'application/vnd.openxmlformats-officedocument'
        ):
            continue

        try:
            root = etree.fromstring(payload)
        except Exception:
            # Best effort: a part that is not parseable XML is ignored.
            continue

        for docpr in root.iter(docpr_tag):
            raw_id = docpr.get('id')
            if raw_id is None:
                continue
            try:
                highest = max(highest, int(raw_id))
            except ValueError:
                continue

    # Never start below 1000, even when no larger id exists.
    self.docx_ids_index = highest if highest > 1000 else 1000

def _init_image_parts_index(self):
    """Reset the bookkeeping used when adding image parts during rendering.

    Two attributes are (re)initialized:

    * ``_image_descriptor_index`` -- an empty dict mapping an image
      descriptor to its ``(image_part, image)`` pair, so repeated descriptors
      can be resolved by a dict lookup instead of content hashing.
    * ``_image_part_counter`` -- the highest ``N`` found among existing
      ``.../imageN.ext`` partnames, or 0 when there is none.
    """
    existing_parts = self.docx._part._package.image_parts

    # Descriptor-keyed cache: image_descriptor -> (image_part, image).
    self._image_descriptor_index = {}

    # Take the maximum existing index rather than the number of parts: with
    # non-contiguous numbering (image1.png + image3.png) a count of 2 would
    # lead to image3.ext being handed out a second time.
    matches = (
        re.search(r'/image(\d+)\.', str(ip.partname)) for ip in existing_parts
    )
    self._image_part_counter = max(
        (int(m.group(1)) for m in matches if m),
        default=0,
    )

def _get_or_add_image_part(self, image_descriptor):
    """Return the ``(image_part, image)`` pair for *image_descriptor*.

    A ``str`` descriptor (a file path) is used directly as the dedup key: if
    it was seen before, the stored pair is returned without reloading the
    image. Any other descriptor (e.g. a file-like object) always gets a new
    image part and is not remembered.

    A new part is named ``/word/media/image<N>.<ext>`` with ``N`` taken from
    the incremented ``_image_part_counter``, and is appended to the package's
    image-part collection.
    """
    from docx.image.image import Image
    from docx.opc.packuri import PackURI
    from docx.parts.image import ImagePart

    # Only string paths are cheap, stable keys for deduplication.
    dedup_by_path = isinstance(image_descriptor, str)

    if dedup_by_path:
        known = self._image_descriptor_index.get(image_descriptor)
        if known is not None:
            return known

    image = Image.from_file(image_descriptor)

    # Allocate the next sequential partname for the new image part.
    self._image_part_counter += 1
    partname = PackURI(
        "/word/media/image%d.%s" % (self._image_part_counter, image.ext)
    )
    image_part = ImagePart.from_image(image, partname)

    # Register the new part with the package.
    self.docx._part._package.image_parts.append(image_part)

    entry = (image_part, image)
    if dedup_by_path:
        self._image_descriptor_index[image_descriptor] = entry
    return entry

def __getattr__(self, name):
    """Delegate attribute lookups that fail on this object to ``self.docx``.

    Python calls this hook only after normal attribute lookup has failed.

    Raises:
        AttributeError: if ``docx`` itself is not set on the instance, or if
            the wrapped object has no attribute called *name*.
    """
    # If ``docx`` is absent from the instance (e.g. an object created by
    # copy/pickle without running __init__), evaluating ``self.docx`` below
    # would re-enter this hook and recurse until RecursionError. Fail with
    # the AttributeError that hasattr()/getattr() callers expect instead.
    if name == "docx":
        raise AttributeError(name)
    return getattr(self.docx, name)
Expand Down
Loading