所需库及安装
在开始之前,需要安装以下 Python 库:
- python-docx:用于创建和修改 Word 文档
- docxtpl:用于基于模板填充 Word 文档
- docxcompose:用于合并多个 Word 文档
- lxml:XML 处理库
可以通过pip安装:
pip install python-docx docxtpl docxcompose lxml
或者使用uv:
uv add python-docx docxtpl docxcompose lxml
1. 基础操作
1.1 创建和保存文档
使用python-docx创建文档非常简单:
from docx import Document

# Document() with no argument starts from python-docx's built-in default template.
doc = Document()
doc.add_paragraph("python-docx是一个用于创建")
doc.save("文件1.docx")
1.2 设置中文字体
默认字体对中文支持不佳,需要单独设置中文字体:
from docx import Document
from docx.oxml.ns import qn


def set_chinese_font(run, zh_font_name="宋体", en_font_name="Times New Roman"):
    """Set separate Latin and East Asian fonts on a run.

    Args:
        run: The python-docx run to format.
        zh_font_name: Font written to the run's ``w:eastAsia`` attribute.
        en_font_name: Font assigned through ``run.font.name``.
    """
    # Assign font.name first: the next line reads run._element.rPr.rFonts,
    # which must already exist.
    run.font.name = en_font_name
    run._element.rPr.rFonts.set(qn("w:eastAsia"), zh_font_name)


doc = Document()
paragraph = doc.add_paragraph()
run = paragraph.add_run('这是一段设置了中文字体的文本。')
set_chinese_font(run)
doc.save("文件1.docx")
注意:保存文件时,目标文件不能处于打开状态,否则会报 PermissionError 错误。
1.3 导入现有文档
from docx import Document

# Passing a path opens an existing .docx file instead of the default template.
doc = Document('example.docx')
注意事项:
- 必须是标准 .docx 文件,不能是 .doc 文件
- 不能是 Strict Open XML 格式
1.4 遍历文档内容
# Walk the first three paragraphs.
for para in doc.paragraphs[:3]:
    print(para)       # the paragraph object itself
    print(para.text)  # the paragraph's text
    print()

# Walk every cell of every table, row by row.
for table in doc.tables:
    for row in table.rows:
        for cell in row.cells:
            print(cell.text)
2. 文档格式设置
2.1 小标题
# (text, level) pairs, added in document order.
for heading_text, heading_level in (
    ("1.1 transformer整体工作流程", 2),
    ("transformer整体架构", 3),
):
    doc.add_heading(heading_text, heading_level)
注意:需要文档里有对应的标题样式,否则会报错。
2.2 段落处理
添加段落
text = """transformer 模型由编码器(encoder)和解码器(decoder)组成。..."""
paragraph1 = doc.add_paragraph(text)
首行缩进
首行缩进2字符:
paragraph_format = paragraph1.paragraph_format
# Assign an indent first so the <w:ind> element read on the next line exists.
paragraph_format.first_line_indent = 0
# '200' is the value the article uses for a 2-character first-line indent.
paragraph_format.element.pPr.ind.set(qn("w:firstLineChars"), '200')
首行缩进固定距离:
from docx.shared import Pt

# Uses the paragraph_format obtained in the previous snippet.
paragraph_format.first_line_indent = Pt(10)
段落对齐
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

paragraph1.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
删除段落
# Detach the paragraph's underlying XML element from its parent element.
p = paragraph1._element
p.getparent().remove(p)
换行处理
# Split the text on newlines and add one paragraph per line.
for one_paragraph_text in text.split("\n"):
    temp_paragraph = doc.add_paragraph(one_paragraph_text)
    paragraph_format = temp_paragraph.paragraph_format
    # Assign an indent first so the <w:ind> element read on the next line exists.
    paragraph_format.first_line_indent = 0
    paragraph_format.element.pPr.ind.set(qn("w:firstLineChars"), "200")
常用段落格式
from docx.shared import Pt

para_format = temp_paragraph.paragraph_format
para_format.line_spacing = Pt(18)  # line spacing (fixed value)
para_format.space_before = Pt(3)   # space before the paragraph
para_format.space_after = Pt(0)    # space after the paragraph
para_format.right_indent = Pt(20)  # right indent
para_format.left_indent = Pt(0)    # left indent
2.3 字符格式设置
from docx.shared import Pt, RGBColor

# Bold text
temp_paragraph.add_run('加粗文本').bold = True

# Red italic text
run = temp_paragraph.add_run('红色斜体文本')
run.font.color.rgb = RGBColor(255, 0, 0)  # red
run.font.size = Pt(14)                    # 14 pt
run.bold = True                           # bold
run.italic = True                         # italic
run.underline = True                      # underline

# Subscript and superscript
run2 = temp_paragraph.add_run("1")
run2.font.subscript = True   # subscript
run3 = temp_paragraph.add_run("2")
run3.font.superscript = True  # superscript
2.4 表格处理
创建表格
table = doc.add_table(rows=4, cols=5)
# Apply a predefined style. The name is matched exactly and the style must
# exist in this document (see the snippet that lists the table styles).
table.style = "Grid Table 1 Light"
填充单元格
# Option 1: address a cell directly by (row, column).
cell = table.cell(0, 1)
cell.text = "parrot, possibly dead"

# Option 2: fetch a row, then index into its cells.
row = table.rows[1]
cells = row.cells
cells[0].text = "foo bar to you."
cells[1].text = "and a hearty foo bar to you too sir!"
获取可用表格样式
from docx.enum.style import WD_STYLE_TYPE

# Print the name of every table-type style defined in the document.
styles = doc.styles
for s in styles:
    if s.type == WD_STYLE_TYPE.TABLE:
        print(s.name)
增加和删除行
# Add one row to the table.
row = table.add_row()
# Delete one row.
def remove_row(table, row):
    """Remove *row* from *table*.

    Args:
        table: Object whose ``_tbl`` attribute holds the row elements.
        row: Object whose ``_tr`` attribute is the element to remove.
    """
    tbl = table._tbl
    tr = row._tr
    tbl.remove(tr)
# Pick the table's last row and delete it with the helper.
row = table.rows[-1]
remove_row(table, row)
批量填充数据
# Option 1: append one row per record.
items = (
    (7, "1024", "plush kittens"),
    (3, "2042", "furbees"),
    (1, "1288", "french poodle collars, deluxe"),
)
for item in items:
    cells = table.add_row().cells
    cells[0].text = str(item[0])  # cell text must be a string
    cells[1].text = item[1]
    cells[2].text = item[2]

# Option 2: write the same value into every cell.
for row in table.rows:
    for cell in row.cells:
        cell.text = "数据单元"
合并单元格
# Merge across rows and columns: from cell (0, 0) to cell (1, 1).
top_left_cell = table.cell(0, 0)
bottom_right_cell = table.cell(1, 1)
top_left_cell.merge(bottom_right_cell)
表格格式设置
from copy import deepcopy

from docx.enum.table import WD_ALIGN_VERTICAL
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Cm, Pt

# Let the table fit its column widths automatically.
table.autofit = True

# Set the height of the first row.
table.rows[0].height = Cm(0.93)

# Change the table font size. This edits the table's style object.
table.style.font.size = Pt(15)

# Center a cell's content horizontally and vertically.
cell = table.cell(0, 0)
cell.paragraphs[0].paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER

# Copy a table: deep-copy it, then insert the copy right after a new paragraph.
table_copy = deepcopy(doc.tables[0])
para1 = doc.add_paragraph()
para1._p.addnext(table_copy._element)
2.5 图片处理
插入图片
import base64
from io import BytesIO

from docx.shared import Inches

# Plain insertion
doc.add_picture('图片1.png')
doc.add_picture('图片2.png', width=Inches(2.5), height=Inches(2))

# Insertion from base64 text; the with-block closes the file after reading.
with open("图片2base64.txt") as f:
    picture2_base64 = f.read()
img2_buf = base64.b64decode(picture2_base64)
doc.add_picture(BytesIO(img2_buf))

# Side by side: add both pictures to the same run.
run = doc.add_paragraph().add_run()
run.add_picture("图片1.png", width=Inches(2.5), height=Inches(2))
run.add_picture("图片1.png", width=Inches(2.5), height=Inches(2))
2.6 分页符
# Add a page break to the document.
doc.add_page_break()
2.7 样式管理
from docx.enum.style import WD_STYLE_TYPE
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt, RGBColor

# Modify an existing style. Styles are looked up by name, hence "Normal".
doc.styles["Normal"].font.size = Pt(14)
doc.styles['Normal'].font.name = 'Arial'
doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), '楷体')

# Create a custom paragraph style.
userstyle1 = doc.styles.add_style('userstyle1', WD_STYLE_TYPE.PARAGRAPH)
userstyle1.font.size = Pt(40)
userstyle1.font.color.rgb = RGBColor(0xff, 0xde, 0x00)
userstyle1.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
userstyle1.font.name = '微软雅黑'
userstyle1._element.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')

# Use the custom style.
doc.add_paragraph('自定义段落样式', style=userstyle1)
3. 使用docxtpl进行模板填充
docxtpl 可以将 Word 文档制作成模板,实现数据自动填充。
3.1 创建模板
首先创建一个包含占位符的 Word 模板,占位符使用双花括号 {{ }} 包裹。
3.2 填充模板
from docxtpl import DocxTemplate, InlineImage, RichText

tpl = DocxTemplate("docxexample.docx")
text = """transformer 模型由编码器(encoder)和解码器(decoder)组成..."""
picture1 = InlineImage(tpl, image_descriptor="图片1.png")
# NOTE(review): the context below uses picture2 but the original snippet never
# defined it. "图片2.png" is an assumption; confirm against the template.
picture2 = InlineImage(tpl, image_descriptor="图片2.png")

# Prepare the data
paragraphs1 = [
    "步骤1:输入表示(input representation)",
    "步骤2:编码器处理(encoder processing)",
    "步骤3:解码器处理(decoder processing)",
]
paragraphs2 = [
    {"step": 1, "text": "输入向量(词嵌入+位置编码)进入编码器层。"},
    {"step": 2, "text": "自注意力子层。"},
    {"step": 3, "text": "前馈网络子层。"},
]
table = [
    {"character": "并行计算", "description": "编码器可并行处理整个序列(与rnn不同)"},
    {"character": "自注意力", "description": "每个词直接关联所有词,捕获长距离依赖"},
    {"character": "位置编码", "description": "为无顺序的注意力机制注入位置信息"},
]
alerts = [
    {
        "date": "2015-03-10",
        "desc": RichText("very critical alert", color="ff0000", bold=True),
        "type": "critical",
        "bg": "ff0000",
    },
    # ... more records
]

# Render the template; each key is a placeholder name used in the template.
context = {
    "title": "transformer",
    "text_body": text,
    "picture1": picture1,
    "picture2": picture2,
    "paragraphs1": paragraphs1,
    "paragraphs2": paragraphs2,
    "runs": paragraphs1,
    "display_paragraph": True,
    "table1": table,
    "table2": table,
    "alerts": alerts,
}
tpl.render(context)
tpl.save("文件3.docx")
4. 进阶功能
4.1 表格高级操作
设置单元格边框
from docx.oxml import OxmlElement
from docx.oxml.ns import qn


def set_cell_border(cell, **kwargs):
    """Set the borders of a table cell.

    Args:
        cell: The python-docx table cell to modify.
        **kwargs: One dict per edge, keyed by edge name ("left", "top",
            "right", "bottom", "insideH", "insideV"). Each dict may contain
            "sz", "val", "color", "space" and "shadow". Every value is
            written as a string attribute on the edge element. Edges that
            are not passed, or are passed empty, stay untouched.
    """
    tc = cell._tc
    tcPr = tc.get_or_add_tcPr()

    # Reuse the cell's <w:tcBorders> element if present, otherwise create it.
    tcBorders = tcPr.first_child_found_in("w:tcBorders")
    if tcBorders is None:
        tcBorders = OxmlElement("w:tcBorders")
        tcPr.append(tcBorders)

    for edge in ("left", "top", "right", "bottom", "insideH", "insideV"):
        edge_data = kwargs.get(edge)
        if edge_data:
            tag = "w:{}".format(edge)

            # Reuse the edge element if present, otherwise create it.
            element = tcBorders.find(qn(tag))
            if element is None:
                element = OxmlElement(tag)
                tcBorders.append(element)

            for key in ["sz", "val", "color", "space", "shadow"]:
                if key in edge_data:
                    element.set(qn("w:{}".format(key)), str(edge_data[key]))
# Usage example: all four edges of cell (0, 0) get the same border settings.
border_spec = {"sz": 4, "val": "single", "color": "#000000", "space": "0"}
set_cell_border(
    table.cell(0, 0),
    **{edge: dict(border_spec) for edge in ("top", "bottom", "left", "right")},
)
4.2 超链接
from docx.oxml import OxmlElement
from docx.oxml.ns import qn


def add_hyperlink(paragraph, url, text):
    """Append an external hyperlink to the end of *paragraph*.

    Args:
        paragraph: The python-docx paragraph that receives the link.
        url: Link target, registered as an external relationship.
        text: Visible text of the link. No run formatting is applied.
    """
    part = paragraph.part
    # The relationship-type URI is case-sensitive ("officeDocument").
    r_id = part.relate_to(
        url,
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        is_external=True,
    )

    hyperlink = OxmlElement("w:hyperlink")
    hyperlink.set(qn("r:id"), r_id)

    # Build <w:r><w:t>text</w:t></w:r> and nest it inside the hyperlink.
    run = OxmlElement("w:r")
    run_text = OxmlElement("w:t")
    run_text.text = text
    run.append(run_text)
    hyperlink.append(run)

    paragraph._p.append(hyperlink)


p = doc.add_paragraph("点击访问: ")
add_hyperlink(p, "https://www.baidu.com", "示例链接")
4.3 图片高级操作
提取文档中的图片
import os
import posixpath
import zipfile
from xml.etree.ElementTree import fromstring

# Relationship type that marks an image part. The original snippet referred to
# it as RT.IMAGE without importing RT, so it is spelled out here.
IMAGE_REL_TYPE = (
    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"
)


def extract_images(docx_path, output_dir):
    """Copy the images referenced by a .docx file's main document into a folder.

    Args:
        docx_path: Path of the .docx file, which is read as a zip archive.
        output_dir: Folder that receives the images. It is created when the
            first image is written.

    Returns:
        The file names of the extracted images, in relationship order. The
        list is empty when the archive has no
        ``word/_rels/document.xml.rels`` entry.
    """
    with zipfile.ZipFile(docx_path) as z:
        try:
            rels_xml = z.read("word/_rels/document.xml.rels")
        except KeyError:
            return []

        # Attribute names in the .rels XML are capitalised: Type, Target.
        targets = [
            rel.attrib["Target"]
            for rel in fromstring(rels_xml)
            if rel.attrib.get("Type") == IMAGE_REL_TYPE and "Target" in rel.attrib
        ]

        images = []
        for target in targets:
            # Targets are relative to word/ unless they start with "/".
            if target.startswith("/"):
                member = target.lstrip("/")
            else:
                member = posixpath.normpath(posixpath.join("word", target))
            try:
                image_data = z.read(member)
            except KeyError:
                # Not stored in the archive (e.g. an external link): skip it.
                continue
            image_name = posixpath.basename(member)
            os.makedirs(output_dir, exist_ok=True)
            with open(os.path.join(output_dir, image_name), "wb") as f:
                f.write(image_data)
            images.append(image_name)
        return images


if __name__ == "__main__":
    print(extract_images("transformer原理纯享版.docx", "pictures"))
插入浮动图片
# Insert a floating picture placed "behind text".
# Changing behindDoc="1" to "0" gives "in front of text" instead.
from docx import Document
from docx.oxml import parse_xml, register_element_cls
from docx.oxml.ns import nsdecls
from docx.oxml.shape import CT_Picture
from docx.oxml.xmlchemy import BaseOxmlElement, OneAndOnlyOne


# refer to docx.oxml.shape.CT_Inline
class CT_Anchor(BaseOxmlElement):
    """
    ``<wp:anchor>`` element, container for a floating image.
    """

    # Required child elements, exposed as attributes.
    extent = OneAndOnlyOne('wp:extent')
    docPr = OneAndOnlyOne('wp:docPr')
    graphic = OneAndOnlyOne('a:graphic')

    @classmethod
    def new(cls, cx, cy, shape_id, pic, pos_x, pos_y):
        """
        Return a new ``<wp:anchor>`` element populated with the values passed
        as parameters.
        """
        anchor = parse_xml(cls._anchor_xml(pos_x, pos_y))
        anchor.extent.cx = cx
        anchor.extent.cy = cy
        anchor.docPr.id = shape_id
        anchor.docPr.name = 'picture %d' % shape_id
        anchor.graphic.graphicData.uri = (
            'http://schemas.openxmlformats.org/drawingml/2006/picture'
        )
        anchor.graphic.graphicData._insert_pic(pic)
        return anchor

    @classmethod
    def new_pic_anchor(cls, shape_id, rid, filename, cx, cy, pos_x, pos_y):
        """
        Return a new `wp:anchor` element containing the `pic:pic` element
        specified by the argument values.
        """
        pic_id = 0  # Word doesn't seem to use this, but does not omit it
        pic = CT_Picture.new(pic_id, filename, rid, cx, cy)
        # new() already inserts pic into graphicData, so the original's second
        # _insert_pic call here was redundant and is dropped.
        return cls.new(cx, cy, shape_id, pic, pos_x, pos_y)

    @classmethod
    def _anchor_xml(cls, pos_x, pos_y):
        """Return the anchor XML template with the two page offsets filled in."""
        return (
            '<wp:anchor distT="0" distB="0" distL="0" distR="0" simplePos="0" relativeHeight="0" \n'
            ' behindDoc="1" locked="0" layoutInCell="1" allowOverlap="1" \n'
            ' %s>\n'
            ' <wp:simplePos x="0" y="0"/>\n'
            ' <wp:positionH relativeFrom="page">\n'
            ' <wp:posOffset>%d</wp:posOffset>\n'
            ' </wp:positionH>\n'
            ' <wp:positionV relativeFrom="page">\n'
            ' <wp:posOffset>%d</wp:posOffset>\n'
            ' </wp:positionV>\n'
            ' <wp:extent cx="914400" cy="914400"/>\n'
            ' <wp:wrapNone/>\n'
            ' <wp:docPr id="666" name="unnamed"/>\n'
            ' <wp:cNvGraphicFramePr>\n'
            ' <a:graphicFrameLocks noChangeAspect="1"/>\n'
            ' </wp:cNvGraphicFramePr>\n'
            ' <a:graphic>\n'
            ' <a:graphicData uri="uri not set"/>\n'
            ' </a:graphic>\n'
            '</wp:anchor>' % (nsdecls('wp', 'a', 'pic', 'r'), int(pos_x), int(pos_y))
        )


# refer to docx.parts.story.BaseStoryPart.new_pic_inline
def new_pic_anchor(part, image_descriptor, width, height, pos_x, pos_y):
    """Return a newly-created `wp:anchor` element.

    The element contains the image specified by *image_descriptor* and is scaled
    based on the values of *width* and *height*.
    """
    rid, image = part.get_or_add_image(image_descriptor)
    cx, cy = image.scaled_dimensions(width, height)
    shape_id, filename = part.next_id, image.filename
    return CT_Anchor.new_pic_anchor(shape_id, rid, filename, cx, cy, pos_x, pos_y)


# refer to docx.text.run.Run.add_picture
def add_float_picture(p, image_path_or_stream, width=None, height=None, pos_x=0, pos_y=0):
    """Add a floating picture to paragraph *p* at a fixed page position.

    `pos_x` and `pos_y` are written as offsets relative to the page
    (``relativeFrom="page"``). They are presumably in EMU, like the extent
    values; confirm before relying on it.
    """
    run = p.add_run()
    anchor = new_pic_anchor(run.part, image_path_or_stream, width, height, pos_x, pos_y)
    run._r.add_drawing(anchor)


# refer to docx/oxml/__init__.py
register_element_cls('wp:anchor', CT_Anchor)

document = Document()

# add a floating picture
p = document.add_paragraph()
add_float_picture(p, '图片1.png')

# add text
p.add_run('hello world '*50)

document.save('文件2.docx')
# https://www.cnblogs.com/dancesir/p/17788854.html
4.4 分栏
# Lay the first section out in 2 columns.
section = doc.sections[0]
sectPr = section._sectPr
# [0] assumes the section already has a <w:cols> element; it raises
# IndexError otherwise.
cols = sectPr.xpath('./w:cols')[0]
cols.set(qn('w:num'), '2')
4.5 页眉页脚
from docx import Document

# Plain header
doc = Document('transformer原理纯享版.docx')
doc.sections[0].header.paragraphs[0].text = "这是第1节页眉"

# Separate headers for odd and even pages
doc.settings.odd_and_even_pages_header_footer = True
doc.sections[0].even_page_header.paragraphs[0].text = "这是偶数页页眉"
doc.sections[0].header.paragraphs[0].text = "这是奇数页页眉"

# Separate header for the first page
doc.sections[0].different_first_page_header_footer = True
doc.sections[0].first_page_header.paragraphs[0].text = "这是首页页眉"
4.6 目录
from docx.oxml import OxmlElement

# Insert a table-of-contents field before the first paragraph. The field is
# only inserted here; it is not updated by this code.
paragraph = doc.paragraphs[0].insert_paragraph_before()
run = paragraph.add_run()

# Field structure: begin, instruction, separate, placeholder result, end.
fldChar = OxmlElement('w:fldChar')
fldChar.set(qn('w:fldCharType'), 'begin')

instrText = OxmlElement('w:instrText')
instrText.set(qn('xml:space'), 'preserve')
instrText.text = r'TOC \o "1-3" \h \z \u'

fldChar2 = OxmlElement('w:fldChar')
fldChar2.set(qn('w:fldCharType'), 'separate')

# Placeholder text shown as the field result.
fldChar3 = OxmlElement('w:t')
fldChar3.text = "right-click to update field."

fldChar4 = OxmlElement('w:fldChar')
fldChar4.set(qn('w:fldCharType'), 'end')

r_element = run._r
r_element.append(fldChar)
r_element.append(instrText)
r_element.append(fldChar2)
# The result text sits in the run between "separate" and "end"; the original
# nested it inside the "separate" fldChar element.
r_element.append(fldChar3)
r_element.append(fldChar4)
# Update the table of contents automatically: add <w:updateFields w:val="true"/>
# to the document settings.
from lxml import etree

name_space = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
update_name_space = "%supdateFields" % name_space
val_name_space = "%sval" % name_space
try:
    element_update_field_obj = etree.SubElement(doc.settings.element, update_name_space)
    element_update_field_obj.set(val_name_space, "true")
except Exception as e:
    # Still best-effort, but report the failure instead of hiding it.
    print("could not enable automatic field update: %s" % e)
4.7 文档合并
from docx import Document
from docxcompose.composer import Composer

# The first document is the master; the others are appended to it in order.
master = Document("文件1.docx")
composer = Composer(master)

doc1 = Document("文件2.docx")
composer.append(doc1)

doc2 = Document("文件3.docx")
composer.append(doc2)

composer.save("combined.docx")
注意:合并文档时,后面的文档会跟随第一个文档的格式。
总结
本文介绍了使用 Python 处理 Word 文档的完整流程,包括:
- 使用 python-docx 进行基础的文档创建、编辑和格式化
- 使用 docxtpl 实现基于模板的自动化数据填充
- 使用 docxcompose 合并多个 Word 文档
- 各种进阶功能,如设置单元格边框、插入超链接、提取图片、设置页眉页脚等
这些技术可以广泛应用于自动化报告生成、批量文档处理、合同模板填充等场景,大大提高工作效率。
以上就是从基础到进阶详解python处理word文档的完全指南的详细内容,更多关于python处理word文档的资料请关注代码网其它相关文章!
发表评论