diff --git a/docxtpl/inline_image.py b/docxtpl/inline_image.py index f860749..da35bbd 100644 --- a/docxtpl/inline_image.py +++ b/docxtpl/inline_image.py @@ -4,8 +4,62 @@ @author: Eric Lapouyade """ +from xml.sax.saxutils import escape as xml_escape + +from docx.opc.constants import RELATIONSHIP_TYPE as RT from docx.oxml import OxmlElement, parse_xml from docx.oxml.ns import qn +from docx.oxml.shape import CT_Inline +from docx.shared import Emu + + +def _build_inline_image_xml_template(): + """Generate the XML format string by calling python-docx with sentinel values. + + This ensures the template always matches the installed python-docx version's + XML structure, even after upgrades. We call CT_Inline.new_pic_inline() once + with recognizable sentinel values, serialize to XML, then replace the + sentinels with Python format placeholders. + """ + import uuid + + # Use GUIDs for string sentinels - guaranteed no collision with XML content + _RID_SENTINEL = str(uuid.uuid4()) + _FILENAME_SENTINEL = str(uuid.uuid4()) + + # For numeric sentinels, use unique integers derived from UUIDs. + # shape_id is xsd:unsignedInt (max 4,294,967,295 / 32-bit). + # cx/cy are EMU values typed as xsd:long (64-bit). + # All use 9-digit range [100000000, 999999999] to stay within 32-bit + # and avoid any accidental collisions with each other. + _SHAPE_ID = uuid.uuid4().int % (9 * 10**8) + 10**8 + _CX_INT = uuid.uuid4().int % (9 * 10**8) + 10**8 + _CY_INT = uuid.uuid4().int % (9 * 10**8) + 10**8 + + inline = CT_Inline.new_pic_inline( + _SHAPE_ID, + _RID_SENTINEL, + _FILENAME_SENTINEL, + Emu(_CX_INT), + Emu(_CY_INT), + ) + xml = inline.xml + + # Replace sentinel values with format string placeholders + xml = xml.replace(str(_SHAPE_ID), "{shape_id}") + xml = xml.replace(_RID_SENTINEL, "{rId}") + xml = xml.replace(_FILENAME_SENTINEL, "{filename}") + xml = xml.replace(str(_CX_INT), "{cx}") + xml = xml.replace(str(_CY_INT), "{cy}") + + return xml + + +# Pre-built XML template for inline images, derived from the installed +# python-docx version. Using str.format() on this template avoids calling +# CT_Inline.new_pic_inline() per image (which does 2x parse_xml() + +# element manipulation + .xml serialization each time). +_INLINE_IMAGE_XML = _build_inline_image_xml_template() class InlineImage(object): @@ -50,16 +104,59 @@ def _add_hyperlink(self, run, url, part): return run def _insert_image(self): - pic = self.tpl.current_rendering_part.new_pic_inline( - self.image_descriptor, - self.width, - self.height, - ).xml + part = self.tpl.current_rendering_part + image_descriptor = self.image_descriptor + + # Cache the expensive parts (image part lookup, rId, dimensions) per + # (part, descriptor, width, height). The XML string itself is NOT + # cached because each insertion needs a unique shape_id - header/footer + # and footnote parts are not renumbered by fix_docpr_ids(). + cache = self.tpl._image_cache + # For hashable descriptors (strings, paths), cache by value. + # For unhashable descriptors (file-like objects), skip caching + # entirely — using id() would risk aliasing after GC. + try: + cache_key = (id(part), image_descriptor, self.width, self.height) + hash(cache_key) is not None # trigger TypeError if unhashable + except TypeError: + cache_key = None + + if cache_key is not None and cache_key in cache: + rId, cx, cy, filename = cache[cache_key] + else: + # Get or add the image part with O(1) descriptor-based dedup, + # avoiding the O(n) linear scan in python-docx's default path. + image_part, image = self.tpl._get_or_add_image_part(image_descriptor) + rId = part.relate_to(image_part, RT.IMAGE) + cx, cy = image.scaled_dimensions(self.width, self.height) + # Escape for use inside XML attribute (quotes must be escaped). + # image.filename is None for file-like descriptors (BytesIO); + # normalize to empty string to match python-docx's behavior. + filename = xml_escape(image.filename or "", {'"': """}) + if cache_key is not None: + cache[cache_key] = (rId, int(cx), int(cy), filename) + + # Always assign a fresh shape_id per insertion so that drawing IDs + # are unique in every part (including headers/footers/footnotes + # which are not renumbered by fix_docpr_ids()). + self.tpl.docx_ids_index += 1 + shape_id = self.tpl.docx_ids_index + + # Generate XML directly as a string using a pre-built template + # rather than calling CT_Inline.new_pic_inline() per image. + pic = _INLINE_IMAGE_XML.format( + cx=int(cx), + cy=int(cy), + shape_id=shape_id, + filename=filename, + rId=rId, + ) + if self.anchor: run = parse_xml(pic) if run.xpath(".//a:blip"): hyperlink = self._add_hyperlink( - run, self.anchor, self.tpl.current_rendering_part + run, self.anchor, part ) pic = hyperlink.xml diff --git a/docxtpl/template.py b/docxtpl/template.py index f20280a..cf339df 100644 --- a/docxtpl/template.py +++ b/docxtpl/template.py @@ -8,17 +8,53 @@ from os import PathLike from typing import TYPE_CHECKING, Any, Optional, IO, Union, Dict, Set -import functools import io from lxml import etree from docx import Document from docx.opc.oxml import parse_xml from docx.opc.part import XmlPart import docx.oxml.ns +from docx.oxml import OxmlElement +from docx.oxml.ns import qn from docx.opc.constants import RELATIONSHIP_TYPE as REL_TYPE -from jinja2 import Environment, Template, meta +from jinja2 import Environment, meta from jinja2.exceptions import TemplateError + +def _create_optimized_env(**kwargs): + """Create an optimized Jinja2 environment for better performance. + + Optimizations applied: + - auto_reload=False: Skip checking if template source changed + - cache_size=400: Larger template cache for repeated renders + - enable_async=False: Disable async support (not needed, adds overhead) + """ + return Environment( + auto_reload=False, # Disable template auto-reload (faster) + cache_size=400, # Increase template cache size + enable_async=False, # Disable async (not needed, reduces overhead) + **kwargs + ) + + +# Module-level cached environments (created once, reused across all instances) +_CACHED_ENV = None +_CACHED_ENV_AUTOESCAPE = None + + +def _get_cached_env(autoescape=False): + """Get or create a cached Jinja2 environment for performance.""" + global _CACHED_ENV, _CACHED_ENV_AUTOESCAPE + + if autoescape: + if _CACHED_ENV_AUTOESCAPE is None: + _CACHED_ENV_AUTOESCAPE = _create_optimized_env(autoescape=True) + return _CACHED_ENV_AUTOESCAPE + else: + if _CACHED_ENV is None: + _CACHED_ENV = _create_optimized_env(autoescape=False) + return _CACHED_ENV + try: from html import escape # noqa: F401 except ImportError: @@ -43,6 +79,80 @@ class DocxTemplate(object): "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer" ) + # Pre-compiled regex patterns for patch_xml() optimization + # These are compiled once at class load time, not on every render + _RE_JINJA_OPEN = re.compile( + r"(?<={)(<[^>]*>)+(?=[\{%\#])|(?<=[%\}#])(<[^>]*>)+(?=\})", + re.DOTALL + ) + _RE_JINJA_CONTENT = re.compile( + r"{%(?:(?!%}).)*|{#(?:(?!#}).)*|{{(?:(?!}}).)*", + re.DOTALL + ) + _RE_COLSPAN = re.compile( + r"(](?:(?!]).)*){%\s*colspan\s+([^%]*)\s*%}(.*?)", + re.DOTALL + ) + _RE_CELLBG = re.compile( + r"(](?:(?!]).)*){%\s*cellbg\s+([^%]*)\s*%}(.*?)", + re.DOTALL + ) + _RE_SPACE_PRESERVE = re.compile( + r"((?:(?!).)*)({{.*?}}|{%.*?%})", + re.DOTALL + ) + _RE_SPACE_PRESERVE_R = re.compile( + r"({{r\s.*?}}|{%r\s.*?%})", + re.DOTALL + ) + _RE_MERGE_PREV = re.compile(r"(?:(?!).)*?{%-", re.DOTALL) + _RE_MERGE_NEXT = re.compile(r"-%}(?:(?!]|{%|{{).)*?]*?>", re.DOTALL) + _RE_VMERGE = re.compile( + r"](?:(?!]).)*?{%\s*vm\s*%}.*?]", + re.DOTALL + ) + _RE_HMERGE = re.compile( + r"](?:(?!]).)*?{%\s*hm\s*%}.*?]", + re.DOTALL + ) + _RE_CLEAN_TAGS = re.compile(r"(?<=\{[\{%])(.*?)(?=[\}%]})") + _RE_PARAGRAPH_NEWLINE = re.compile(r"])") + _RE_PARAGRAPH_REMOVE_NEWLINE = re.compile(r"\n])") + _RE_STRIPTAGS = re.compile(r".*?(|]*>)", re.DOTALL) + _RE_COLSPAN_EMPTY = re.compile(r"](?:(?!]).)*.*?", re.DOTALL) + _RE_GRIDSPAN = re.compile(r"") + _RE_TCPR = re.compile(r"(]*>)") + _RE_SHD = re.compile(r"") + _RE_RESOLVE_PARAGRAPH = re.compile(r"]*)?>.*?", re.DOTALL) + _RE_RESOLVE_RUN = re.compile(r"]*)?>.*?", re.DOTALL) + _RE_RESOLVE_TEXT = re.compile(r"]*)?>.*?", re.DOTALL) + _RE_RUN_PROPS = re.compile(r".*?") + _RE_PARA_PROPS = re.compile(r".*?") + + # Pre-compiled patterns for tag-stripping in patch_xml(). + # Strips surrounding tags from {%y ...%} / {{y ...}} template tags. + _RE_TAG_STRIP = tuple( + re.compile( + r"](?:(?!]).)*({%%|{{)%s ([^}%%]*(?:%%}|}})).*?" + % (y, y, y, y), + re.DOTALL, + ) + for y in ("tr", "tc", "p", "r") + ) + # Same for {#y ...#} comment tags (not 'r' - comments in runs are uncommon). + _RE_COMMENT_STRIP = tuple( + re.compile( + r"](?:(?!]).)*({#)%s ([^}#]*(?:#})).*?" + % (y, y, y, y), + re.DOTALL, + ) + for y in ("tr", "tc", "p") + ) + + # Precompiled pattern for fast detection of any Jinja syntax in a string. + # Used in render() to skip header/footer processing when no tags are present. + _JINJA_PATTERN = re.compile(r'\{\{|\{%|\{#') + def __init__(self, template_file: Union[IO[bytes], str, PathLike]) -> None: self.template_file = template_file self.reset_replacements() @@ -60,8 +170,113 @@ def render_init(self): self.init_docx() self.pic_map = {} self.current_rendering_part = None - self.docx_ids_index = 1000 + self._image_cache = {} self.is_saved = False + self._init_image_parts_index() + self._init_docx_ids_index() + + def _init_docx_ids_index(self): + """Set docx_ids_index above the maximum existing wp:docPr id. + + fix_docpr_ids() only renumbers the body tree, so IDs in headers, + footers, and footnotes retain their original values. Starting the + counter above the global maximum prevents collisions when inserting + new drawings into any part. + """ + import docx.oxml.ns as _ns + wp_ns = _ns.nsmap['wp'] + tag = "{%s}docPr" % wp_ns + max_id = 0 + + # Scan all parts (body + headers + footers + footnotes) + for part in self.docx._part._package.parts: + if not hasattr(part, 'blob') or part.blob is None: + continue + # Only scan XML parts that could contain drawings + ct = getattr(part, 'content_type', '') + if not ct.startswith('application/vnd.openxmlformats-officedocument'): + continue + try: + tree = etree.fromstring(part.blob) + except Exception: + continue + for elt in tree.iter(tag): + id_val = elt.get('id') + if id_val is not None: + try: + val = int(id_val) + if val > max_id: + max_id = val + except ValueError: + pass + + # Start above the highest existing ID (minimum 1000 for safety) + self.docx_ids_index = max(max_id, 1000) + + def _init_image_parts_index(self): + """Initialize image-part tracking for fast insertion. + + Uses a descriptor-keyed cache (file path string) for O(1) dedup of + images added during rendering, avoiding expensive content hashing. + """ + package = self.docx._part._package + image_parts = package.image_parts + + # Descriptor-keyed cache: maps image_descriptor -> (image_part, image) + # This is the primary dedup mechanism and avoids expensive content hashing. + self._image_descriptor_index = {} + + # Derive the next partname index by scanning existing partnames once. + # Using len() alone would collide with non-contiguous numbering + # (e.g. image1.png + image3.png → len=2 → next would be image3.ext). + max_index = 0 + for ip in image_parts: + # Partnames follow /word/media/imageN.ext pattern + name = str(ip.partname) + m = re.search(r'/image(\d+)\.', name) + if m: + idx = int(m.group(1)) + if idx > max_index: + max_index = idx + self._image_part_counter = max_index + + def _get_or_add_image_part(self, image_descriptor): + """Return (image_part, image) for the given image_descriptor. + + Uses the descriptor itself (file path) as the dedup key, avoiding + expensive content hashing. Falls back to always creating a new part + for non-hashable descriptors (file-like objects). + """ + from docx.image.image import Image + from docx.opc.packuri import PackURI + from docx.parts.image import ImagePart + + # For string paths, use the path as a cheap dedup key. + cache_key = image_descriptor if isinstance(image_descriptor, str) else None + + if cache_key is not None: + cached = self._image_descriptor_index.get(cache_key) + if cached is not None: + return cached + + image = Image.from_file(image_descriptor) + + # Create image part with sequential partname + self._image_part_counter += 1 + partname = PackURI( + "/word/media/image%d.%s" % (self._image_part_counter, image.ext) + ) + image_part = ImagePart.from_image(image, partname) + + # Add to the package collection + package = self.docx._part._package + package.image_parts.append(image_part) + + result = (image_part, image) + if cache_key is not None: + self._image_descriptor_index[cache_key] = result + + return result def __getattr__(self, name): return getattr(self.docx, name) @@ -88,114 +303,64 @@ def patch_xml(self, src_xml): unescape html entities, etc...""" # replace {{ by {{ ( works with {{ }} {% and %} {# and #}) - src_xml = re.sub( - r"(?<={)(<[^>]*>)+(?=[\{%\#])|(?<=[%\}\#])(<[^>]*>)+(?=\})", - "", - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_JINJA_OPEN.sub("", src_xml) # replace {{jinja2 stuff}} by {{jinja2 stuff}} # same thing with {% ... %} and {# #} # "jinja2 stuff" could a variable, a 'if' etc... anything jinja2 will understand def striptags(m): - return re.sub( - ".*?(|]*>)", "", m.group(0), flags=re.DOTALL - ) + return self._RE_STRIPTAGS.sub("", m.group(0)) - src_xml = re.sub( - r"{%(?:(?!%}).)*|{#(?:(?!#}).)*|{{(?:(?!}}).)*", - striptags, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_JINJA_CONTENT.sub(striptags, src_xml) # manage table cell colspan def colspan(m): cell_xml = m.group(1) + m.group(3) - cell_xml = re.sub( - r"](?:(?!]).)*.*?", - "", - cell_xml, - flags=re.DOTALL, - ) - cell_xml = re.sub(r"", "", cell_xml, count=1) - return re.sub( - r"(]*>)", + cell_xml = self._RE_COLSPAN_EMPTY.sub("", cell_xml) + cell_xml = self._RE_GRIDSPAN.sub("", cell_xml, count=1) + return self._RE_TCPR.sub( r'\1' % m.group(2), cell_xml, ) - src_xml = re.sub( - r"(](?:(?!]).)*){%\s*colspan\s+([^%]*)\s*%}(.*?)", - colspan, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_COLSPAN.sub(colspan, src_xml) # manage table cell background color def cellbg(m): cell_xml = m.group(1) + m.group(3) - cell_xml = re.sub( - r"](?:(?!]).)*.*?", - "", - cell_xml, - flags=re.DOTALL, - ) - cell_xml = re.sub(r"", "", cell_xml, count=1) - return re.sub( - r"(]*>)", + cell_xml = self._RE_COLSPAN_EMPTY.sub("", cell_xml) + cell_xml = self._RE_SHD.sub("", cell_xml, count=1) + return self._RE_TCPR.sub( r'\1' % m.group(2), cell_xml, ) - src_xml = re.sub( - r"(](?:(?!]).)*){%\s*cellbg\s+([^%]*)\s*%}(.*?)", - cellbg, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_CELLBG.sub(cellbg, src_xml) # ensure space preservation - src_xml = re.sub( - r"((?:(?!).)*)({{.*?}}|{%.*?%})", + src_xml = self._RE_SPACE_PRESERVE.sub( r'\1\2', src_xml, - flags=re.DOTALL, ) - src_xml = re.sub( - r"({{r\s.*?}}|{%r\s.*?%})", + src_xml = self._RE_SPACE_PRESERVE_R.sub( r'\1', src_xml, - flags=re.DOTALL, ) # {%- will merge with previous paragraph text - src_xml = re.sub(r"(?:(?!).)*?{%-", "{%", src_xml, flags=re.DOTALL) + src_xml = self._RE_MERGE_PREV.sub("{%", src_xml) # -%} will merge with next paragraph text - src_xml = re.sub( - r"-%}(?:(?!]|{%|{{).)*?]*?>", "%}", src_xml, flags=re.DOTALL - ) + src_xml = self._RE_MERGE_NEXT.sub("%}", src_xml) - for y in ["tr", "tc", "p", "r"]: - # replace into xml code the row/paragraph/run containing - # {%y xxx %} or {{y xxx}} template tag - # by {% xxx %} or {{ xx }} without any surrounding tags : - # This is mandatory to have jinja2 generating correct xml code - pat = ( - r"](?:(?!]).)*({%%|{{)%(y)s ([^}%%]*(?:%%}|}})).*?" - % {"y": y} - ) - src_xml = re.sub(pat, r"\1 \2", src_xml, flags=re.DOTALL) - - for y in ["tr", "tc", "p"]: - # same thing, but for {#y xxx #} (but not where y == 'r', since that - # makes less sense to use comments in that context - pat = ( - r"](?:(?!]).)*({#)%(y)s ([^}#]*(?:#})).*?" - % {"y": y} - ) - src_xml = re.sub(pat, r"\1 \2", src_xml, flags=re.DOTALL) + # Strip surrounding tags from {%y ...%} / {{y ...}} template tags. + # This is mandatory for jinja2 to generate correct xml code. + # Patterns are pre-compiled as class attributes to avoid recompilation. + for pat in self._RE_TAG_STRIP: + src_xml = pat.sub(r"\1 \2", src_xml) + + # Same for {#y ...#} comment tags (not 'r' — comments in runs are uncommon). + for pat in self._RE_COMMENT_STRIP: + src_xml = pat.sub(r"\1 \2", src_xml) # add vMerge # use {% vm %} to make this table cell and its copies @@ -220,12 +385,7 @@ def v_merge(m1): flags=re.DOTALL, ) - src_xml = re.sub( - r"](?:(?!]).)*?{%\s*vm\s*%}.*?]", - v_merge_tc, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_VMERGE.sub(v_merge_tc, src_xml) # Use ``{% hm %}`` to make table cell become horizontally merged within # a ``{% for %}``. @@ -279,12 +439,7 @@ def without_gridspan(m2): # Discard every other cell generated in loop. return "{% if loop.first %}" + xml + "{% endif %}" - src_xml = re.sub( - r"](?:(?!]).)*?{%\s*hm\s*%}.*?]", - h_merge_tc, - src_xml, - flags=re.DOTALL, - ) + src_xml = self._RE_HMERGE.sub(h_merge_tc, src_xml) def clean_tags(m): return ( @@ -298,18 +453,17 @@ def clean_tags(m): .replace("’", "'") ) - src_xml = re.sub(r"(?<=\{[\{%])(.*?)(?=[\}%]})", clean_tags, src_xml) + src_xml = self._RE_CLEAN_TAGS.sub(clean_tags, src_xml) return src_xml def render_xml_part(self, src_xml, part, context, jinja_env=None): - src_xml = re.sub(r"])", r"\n])", r" None: if jinja_env is None: - jinja_env = Environment() + jinja_env = _get_cached_env() for section in self.docx.sections: for part in section.part.package.parts: @@ -378,6 +532,10 @@ def render_footnotes( part._blob = xml.encode("utf-8") def resolve_listing(self, xml): + # Early exit: if no Listing special characters are present (common case), + # there's nothing to resolve, skip the work below. + if "\t" not in xml and "\n" not in xml and "\a" not in xml and "\f" not in xml: + return xml def resolve_text(run_properties, paragraph_properties, m): xml = m.group(0).replace( @@ -403,30 +561,24 @@ def resolve_text(run_properties, paragraph_properties, m): return xml def resolve_run(paragraph_properties, m): - run_properties = re.search(r".*?", m.group(0)) + run_properties = self._RE_RUN_PROPS.search(m.group(0)) run_properties = run_properties.group(0) if run_properties else "" - return re.sub( - r"]*)?>.*?", + return self._RE_RESOLVE_TEXT.sub( lambda x: resolve_text(run_properties, paragraph_properties, x), m.group(0), - flags=re.DOTALL, ) def resolve_paragraph(m): - paragraph_properties = re.search(r".*?", m.group(0)) + paragraph_properties = self._RE_PARA_PROPS.search(m.group(0)) paragraph_properties = ( paragraph_properties.group(0) if paragraph_properties else "" ) - return re.sub( - r"]*)?>.*?", + return self._RE_RESOLVE_RUN.sub( lambda x: resolve_run(paragraph_properties, x), m.group(0), - flags=re.DOTALL, ) - xml = re.sub( - r"]*)?>.*?", resolve_paragraph, xml, flags=re.DOTALL - ) + xml = self._RE_RESOLVE_PARAGRAPH.sub(resolve_paragraph, xml) return xml @@ -437,9 +589,57 @@ def build_xml(self, context, jinja_env=None): return xml def map_tree(self, tree): + """Replace the body element with the rendered tree. + + Instead of iterating over all body children to remove/re-append them + one-by-one (O(n) lxml operations, each with internal bookkeeping), + we swap the entire element in the document root using + root.remove() + root.insert(). This is O(1) since the root element + () has only a handful of direct children. + + The body's index is located first so document element order is + preserved (e.g. body before sectPr). + + SAFETY: If the body is not a direct child of root (malformed template) + or if remove/insert raises for any reason, we fall back to the slower + child-by-child copy so rendering is never broken. + """ root = self.docx._element - body = root.body - root.replace(body, tree) + old_body = root.body + + # Find where sits among root's direct children so we can + # re-insert the new tree at the same position. + body_index = None + for i, child in enumerate(root): + if child is old_body: + body_index = i + break + + if body_index is None: + # Malformed template – body is not a direct child of root. + # Fall back to child-by-child replacement on the existing body. + for child in list(old_body): + old_body.remove(child) + for child in list(tree): + old_body.append(child) + return + + try: + # Detach the old body and insert the new tree (which is itself a + # element returned by fix_tables/parse_xml) at the same + # position. This avoids O(n) per-child remove/append calls. + root.remove(old_body) + root.insert(body_index, tree) + except Exception: + # If something went wrong, restore the document to a usable state + # by re-attaching the old body (if it was already detached) and + # falling back to child-by-child copy. + if old_body.getparent() is None: + root.insert(body_index, old_body) + for child in list(old_body): + old_body.remove(child) + for child in list(tree): + old_body.append(child) def get_headers_footers(self, uri): for relKey, val in self.docx._part.rels.items(): @@ -458,10 +658,17 @@ def get_headers_footers_encoding(self, xml): def build_headers_footers_xml(self, context, uri, jinja_env=None): for relKey, part in self.get_headers_footers(uri): xml = self.get_part_xml(part) - encoding = self.get_headers_footers_encoding(xml) - xml = self.patch_xml(xml) - xml = self.render_xml_part(xml, part, context, jinja_env) - yield relKey, xml.encode(encoding) + + # Skip rendering if no Jinja tags present + # Headers/footers are often static, so this avoids caching/parsing overhead + if self._RE_JINJA_OPEN.search(xml) or self._RE_JINJA_CONTENT.search(xml): + encoding = self.get_headers_footers_encoding(xml) + xml = self.patch_xml(xml) + xml = self.render_xml_part(xml, part, context, jinja_env) + yield relKey, xml.encode(encoding) + else: + encoding = self.get_headers_footers_encoding(xml) + yield relKey, xml.encode(encoding) def map_headers_footers_xml(self, relKey, xml): part = self.docx._part.rels[relKey].target_part @@ -479,58 +686,105 @@ def render( # init template working attributes self.render_init() - if autoescape: - if not jinja_env: - jinja_env = Environment(autoescape=autoescape) - else: - jinja_env.autoescape = autoescape + # Use cached environment by default + if not jinja_env: + jinja_env = _get_cached_env(autoescape=autoescape) + elif autoescape: + jinja_env.autoescape = autoescape # Body xml_src = self.build_xml(context, jinja_env) - # fix tables if needed + # Fix tables if needed tree = self.fix_tables(xml_src) - # fix docPr ID's + # Fix docPr ID's self.fix_docpr_ids(tree) # Replace body xml tree self.map_tree(tree) - # Headers - headers = self.build_headers_footers_xml(context, self.HEADER_URI, jinja_env) - for relKey, xml in headers: - self.map_headers_footers_xml(relKey, xml) - - # Footers - footers = self.build_headers_footers_xml(context, self.FOOTER_URI, jinja_env) - for relKey, xml in footers: - self.map_headers_footers_xml(relKey, xml) - + # Headers & Footers - skip when no Jinja tags are present. + # Uses both _JINJA_PATTERN (intact tags) and _RE_JINJA_OPEN (tags + # split across XML runs by Word). + for uri in (self.HEADER_URI, self.FOOTER_URI): + try: + has_jinja = any( + self._JINJA_PATTERN.search(xml) + or self._RE_JINJA_OPEN.search(xml) + for xml in ( + self.get_part_xml(part) + for _relKey, part in self.get_headers_footers(uri) + ) + ) + if has_jinja: + for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): + self.map_headers_footers_xml(relKey, xml) + except Exception: + # Fallback: guards against unexpected part structure (e.g. blob + # is None, missing attributes). Not malformed XML - that would + # fail in build_headers_footers_xml regardless. + for relKey, xml in self.build_headers_footers_xml(context, uri, jinja_env): + self.map_headers_footers_xml(relKey, xml) + + # Properties: no skip-check needed - these are a handful of short + # strings (author, title, etc.) where from_string() is near-zero cost. self.render_properties(context, jinja_env) + # Footnotes: no skip-check needed - at most one part exists in typical + # documents, and many have none, so the loop body rarely executes. self.render_footnotes(context, jinja_env) # set rendered flag self.is_rendered = True - # using of TC tag in for cycle can cause that count of columns does not - # correspond to real count of columns in row. This function is able to fix it. + # Using of TC tag in for cycle can cause that count of columns does not + # correspond to real count of columns in row. def fix_tables(self, xml): - parser = etree.XMLParser(recover=True) - tree = etree.fromstring(xml, parser=parser) + # Use parse_xml with safe fallback for malformed XML + try: + tree = parse_xml(xml) # parse_xml() is significantly faster + except Exception: + # Fallback to permissive parser in the event of malformed XML + parser = etree.XMLParser(recover=True) + tree = etree.fromstring(xml, parser=parser) # get namespace ns = "{" + tree.nsmap["w"] + "}" # walk trough xml and find table for t in tree.iter(ns + "tbl"): tblGrid = t.find(ns + "tblGrid") + if tblGrid is None: + continue + columns = tblGrid.findall(ns + "gridCol") - to_add = 0 - # walk trough all rows and try to find if there is higher cell count + columns_len = len(columns) + + # Single pass row analysis with both counters + max_raw_cells = 0 # For ADD decision (raw tc count) + max_effective_cells = 0 # For REMOVE decision (with gridSpan) + for r in t.iter(ns + "tr"): cells = r.findall(ns + "tc") - if (len(columns) + to_add) < len(cells): - to_add = len(cells) - len(columns) + raw_count = len(cells) + effective_count = 0 + + for cell in cells: + tc_pr = cell.find(ns + "tcPr") + if tc_pr is not None: + grid_span = tc_pr.find(ns + "gridSpan") + if grid_span is not None: + effective_count += int(grid_span.get(ns + "val")) + continue + effective_count += 1 + + if raw_count > max_raw_cells: + max_raw_cells = raw_count + if effective_count > max_effective_cells: + max_effective_cells = effective_count + + # ADD columns based on RAW cell count (original behavior) + to_add = max_raw_cells - columns_len if max_raw_cells > columns_len else 0 + # is necessary to add columns? if to_add > 0: # at first, calculate width of table according to columns @@ -552,34 +806,16 @@ def fix_tables(self, xml): int(float(c.get(ns + "w")) * new_average / old_average) ), ) - # add new columns + # add new columns using OxmlElement for proper python-docx compatibility for i in range(to_add): - etree.SubElement( - tblGrid, ns + "gridCol", {ns + "w": str(int(new_average))} - ) + new_col = OxmlElement('w:gridCol') + new_col.set(qn('w:w'), str(int(new_average))) + tblGrid.append(new_col) - # Refetch columns after columns addition. + # REMOVE columns based on EFFECTIVE cell count (original behavior) columns = tblGrid.findall(ns + "gridCol") columns_len = len(columns) - - cells_len_max = 0 - - def get_cell_len(total, cell): - tc_pr = cell.find(ns + "tcPr") - grid_span = None if tc_pr is None else tc_pr.find(ns + "gridSpan") - - if grid_span is not None: - return total + int(grid_span.get(ns + "val")) - - return total + 1 - - # Calculate max of table cells to compare with `gridCol`. - for r in t.iter(ns + "tr"): - cells = r.findall(ns + "tc") - cells_len = functools.reduce(get_cell_len, cells, 0) - cells_len_max = max(cells_len_max, cells_len) - - to_remove = columns_len - cells_len_max + to_remove = columns_len - max_effective_cells if columns_len > max_effective_cells else 0 # If after the loop, there're less columns, than # originally was, remove extra `gridCol` declarations. @@ -608,8 +844,11 @@ def get_cell_len(total, cell): return tree def fix_docpr_ids(self, tree): - # some Ids may have some collisions : so renumbering all of them : - for elt in tree.xpath("//wp:docPr", namespaces=docx.oxml.ns.nsmap): + # Some Ids may have some collisions : so renumbering all of them + wp_ns = docx.oxml.ns.nsmap['wp'] + tag = "{%s}docPr" % wp_ns + + for elt in tree.iter(tag): self.docx_ids_index += 1 elt.attrib["id"] = str(self.docx_ids_index) @@ -913,7 +1152,7 @@ def get_undeclared_template_variables( if jinja_env: env = jinja_env else: - env = Environment() + env = _get_cached_env() parse_content = env.parse(xml) all_variables = meta.find_undeclared_variables(parse_content) diff --git a/pyproject.toml b/pyproject.toml index 476fc0e..28533c4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,10 @@ repository = "https://github.com/elapouya/python-docx-template.git" document = "https://docxtpl.readthedocs.org" [tool.poetry] +name = "docxtpl" version = "0.0.0" +description = "Python docx template engine" +authors = ["Eric Lapouyade "] [tool.poetry.requires-plugins] poetry-dynamic-versioning = { version = ">=1.0.0,<2.0.0", extras = ["plugin"] }