Skip to content
This repository has been archived by the owner on Apr 16, 2022. It is now read-only.

add code to handle image merge #51

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 83 additions & 30 deletions mailmerge.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,15 @@
from lxml.etree import Element
from lxml import etree
from zipfile import ZipFile, ZIP_DEFLATED
from random import randint

# Prefix -> namespace URI map. It is interpolated into qualified tag names
# (e.g. '{%(w)s}p' % NAMESPACES) and passed as `namespaces=` to find/findall.
NAMESPACES = {
# WordprocessingML main markup
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
# Markup-compatibility namespace
'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
# Package content-types namespace
'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',
# DrawingML wordprocessingDrawing namespace (used to look up wp:docPr)
'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
# DrawingML main namespace (used to look up a:blip)
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
# DrawingML picture namespace
'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
}

CONTENT_TYPES_PARTS = (
Expand All @@ -26,6 +30,12 @@ def __init__(self, file, remove_empty_tables=False):
self.parts = {}
self.settings = None
self._settings_info = None

self.media = {} # new images to add indexed by embed id
self.rels = None # etree for relationships
self._rels_info = None # zi info block for rels
self.RELS_NAMESPACES = {'ns': None, 'od': None}

self.remove_empty_tables = remove_empty_tables

try:
Expand All @@ -37,6 +47,13 @@ def __init__(self, file, remove_empty_tables=False):
elif type == CONTENT_TYPE_SETTINGS:
self._settings_info, self.settings = self.__get_tree_of_file(file)

# get the rels for image mappings
try:
self._rels_info, self.rels = self.__get_tree_of_file('word/_rels/document.xml.rels')
self.RELS_NAMESPACES['ns'] = self.rels.getroot().nsmap.get(None)
self.RELS_NAMESPACES['od'] = self.rels.getroot().nsmap.get(None).replace('package', 'officeDocument')
except:
pass
to_delete = []

r = re.compile(r' MERGEFIELD +"?([^ ]+?)"? +(|\\\* MERGEFORMAT )', re.I)
Expand Down Expand Up @@ -108,7 +125,10 @@ def __init__(self, file, remove_empty_tables=False):
raise

def __get_tree_of_file(self, file):
    """
    Parse one member of the opened archive into an XML tree.

    :param file: either an lxml element carrying a ``PartName`` attribute
        (the leading ``/`` segment is stripped to obtain the archive member
        name), or the archive member name itself as a string.
    :return: tuple ``(zi, tree)`` where ``zi`` is the ``ZipInfo`` of the
        member and ``tree`` is the parsed ``etree`` document.
    :raises KeyError: if the member name is not present in the archive
        (raised by ``ZipFile.getinfo``).
    """
    # The stale pre-change lookup (file.attrib[...]) is intentionally gone:
    # it ran unconditionally and failed for plain string paths.
    if isinstance(file, etree._Element):
        fn = file.get('PartName').split('/', 1)[1]
    else:
        fn = file
    zi = self.zip.getinfo(fn)
    # Close the member stream once parsing has consumed it.
    with self.zip.open(zi) as member:
        tree = etree.parse(member)
    return zi, tree

Expand All @@ -125,8 +145,14 @@ def write(self, file):
elif zi == self._settings_info:
xml = etree.tostring(self.settings.getroot())
output.writestr(zi.filename, xml)
elif zi == self._rels_info:
xml = etree.tostring(self.rels.getroot())
output.writestr(zi.filename, xml)
else:
output.writestr(zi.filename, self.zip.read(zi))
# add new images to media folder if we have images merged
for img_id, img_data in self.media.items():
output.writestr('media/{}.png'.format(img_id), img_data)

def get_merge_fields(self, parts=None):
if not parts:
Expand All @@ -141,7 +167,7 @@ def merge_templates(self, replacements, separator):
"""
Duplicate template. Creates a copy of the template, does a merge, and separates them by a new paragraph, a new break or a new section break.
separator must be :
- page_break : Page Break.
- page_break : Page Break.
- column_break : Column Break. ONLY HAS AN EFFECT IF THE DOCUMENT HAS COLUMNS
- textWrapping_break : Line Break.
- continuous_section : Continuous section break. Begins the section on the next paragraph.
Expand All @@ -151,58 +177,58 @@ def merge_templates(self, replacements, separator):
- oddPage_section : oddPage section break. section begins on the next odd-numbered page, leaving the next even page blank if necessary.
"""

#TYPE PARAM CONTROL AND SPLIT
valid_separators = {'page_break', 'column_break', 'textWrapping_break', 'continuous_section', 'evenPage_section', 'nextColumn_section', 'nextPage_section', 'oddPage_section'}
# TYPE PARAM CONTROL AND SPLIT
valid_separators = {'page_break', 'column_break', 'textWrapping_break', 'continuous_section',
'evenPage_section', 'nextColumn_section', 'nextPage_section', 'oddPage_section'}
if not separator in valid_separators:
raise ValueError("Invalid separator argument")
type, sepClass = separator.split("_")


#GET ROOT - WORK WITH DOCUMENT
# GET ROOT - WORK WITH DOCUMENT
for part in self.parts.values():
root = part.getroot()
tag = root.tag
if tag == '{%(w)s}ftr' % NAMESPACES or tag == '{%(w)s}hdr' % NAMESPACES:
continue

if sepClass == 'section':

#FINDING FIRST SECTION OF THE DOCUMENT
# FINDING FIRST SECTION OF THE DOCUMENT
firstSection = root.find("w:body/w:p/w:pPr/w:sectPr", namespaces=NAMESPACES)
if firstSection == None:
firstSection = root.find("w:body/w:sectPr", namespaces=NAMESPACES)
#MODIFY TYPE ATTRIBUTE OF FIRST SECTION FOR MERGING

# MODIFY TYPE ATTRIBUTE OF FIRST SECTION FOR MERGING
nextPageSec = deepcopy(firstSection)
for child in nextPageSec:
#Delete old type if exist
# Delete old type if exist
if child.tag == '{%(w)s}type' % NAMESPACES:
nextPageSec.remove(child)
#Create new type (def parameter)
newType = etree.SubElement(nextPageSec, '{%(w)s}type' % NAMESPACES)
newType.set('{%(w)s}val' % NAMESPACES, type)
# Create new type (def parameter)
newType = etree.SubElement(nextPageSec, '{%(w)s}type' % NAMESPACES)
newType.set('{%(w)s}val' % NAMESPACES, type)

#REPLACING FIRST SECTION
# REPLACING FIRST SECTION
secRoot = firstSection.getparent()
secRoot.replace(firstSection, nextPageSec)

#FINDING LAST SECTION OF THE DOCUMENT
# FINDING LAST SECTION OF THE DOCUMENT
lastSection = root.find("w:body/w:sectPr", namespaces=NAMESPACES)

#SAVING LAST SECTION
# SAVING LAST SECTION
mainSection = deepcopy(lastSection)
lsecRoot = lastSection.getparent()
lsecRoot.remove(lastSection)

#COPY CHILDREN ELEMENTS OF BODY IN A LIST
# COPY CHILDREN ELEMENTS OF BODY IN A LIST
childrenList = root.findall('w:body/*', namespaces=NAMESPACES)

#DELETE ALL CHILDREN OF BODY
# DELETE ALL CHILDREN OF BODY
for child in root:
if child.tag == '{%(w)s}body' % NAMESPACES:
child.clear()

#REFILL BODY AND MERGE DOCS - ADD LAST SECTION ENCAPSULATED OR NOT
# REFILL BODY AND MERGE DOCS - ADD LAST SECTION ENCAPSULATED OR NOT
lr = len(replacements)
lc = len(childrenList)
parts = []
Expand All @@ -220,27 +246,27 @@ def merge_templates(self, replacements, separator):
else:
if sepClass == 'section':
intSection = deepcopy(mainSection)
p = etree.SubElement(child, '{%(w)s}p' % NAMESPACES)
pPr = etree.SubElement(p, '{%(w)s}pPr' % NAMESPACES)
p = etree.SubElement(child, '{%(w)s}p' % NAMESPACES)
pPr = etree.SubElement(p, '{%(w)s}pPr' % NAMESPACES)
pPr.append(intSection)
parts.append(p)
elif sepClass == 'break':
pb = etree.SubElement(child, '{%(w)s}p' % NAMESPACES)
r = etree.SubElement(pb, '{%(w)s}r' % NAMESPACES)
pb = etree.SubElement(child, '{%(w)s}p' % NAMESPACES)
r = etree.SubElement(pb, '{%(w)s}r' % NAMESPACES)
nbreak = Element('{%(w)s}br' % NAMESPACES)
nbreak.attrib['{%(w)s}type' % NAMESPACES] = type
r.append(nbreak)

self.merge(parts, **repl)

def merge_pages(self, replacements):
    """
    Deprecated method.

    Emits a DeprecationWarning and forwards to ``merge_templates`` using
    the ``"page_break"`` separator.

    :param replacements: passed through unchanged to ``merge_templates``.
    """
    # stacklevel=2 attributes the warning to the caller of merge_pages.
    warnings.warn("merge_pages has been deprecated in favour of merge_templates",
                  category=DeprecationWarning,
                  stacklevel=2)
    self.merge_templates(replacements, "page_break")

def merge(self, parts=None, **replacements):
if not parts:
Expand All @@ -254,6 +280,33 @@ def merge(self, parts=None, **replacements):
self.__merge_field(part, field, replacement)

def __merge_field(self, part, field, text):
if field.startswith('IMAGE:'):
_, img_name = field.split(':')
inline_img_el = part.find('.//wp:docPr[@title="{}"]/..'.format(img_name), namespaces=NAMESPACES)
if inline_img_el:
embed_node = inline_img_el.find('.//a:blip', namespaces=NAMESPACES)
if embed_node:
# generate a random id and add it to the media list for later export to the media folder in the zip file
img_id = 'MMR{}'.format(randint(10000000, 999999999))
self.media[img_id] = text

# add a relationship
last_img_relationship = \
self.rels.findall('{%(ns)s}Relationship[@Type="%(od)s/image"]' % self.RELS_NAMESPACES)[-1]
new_img_relationship = deepcopy(last_img_relationship)
new_img_relationship.set('Id', img_id)
new_img_relationship.set('Target', '/media/{}.png'.format(img_id))
self.rels.getroot().append(new_img_relationship)

# replace the embed attrib with the new image_id
embed_node = inline_img_el.find('.//a:blip', namespaces=NAMESPACES)
embed_attr = embed_node.attrib.keys()[0]
embed_node.attrib[embed_attr] = img_id
# mark as done
inline_img_el.find('wp:docPr', namespaces=NAMESPACES).attrib['title'] = 'replaced_image_{}'.format(
img_id)
return

for mf in part.findall('.//MergeField[@name="%s"]' % field):
children = list(mf)
mf.clear() # clear away the attributes
Expand Down