方法一:使用 docx2pdf 和 pdf2image(推荐)
安装所需库
pip install python-docx docx2pdf pdf2image pillow
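# 还需要安装 poppler(pdf2image 依赖它来渲染 PDF,所有平台都需要)
# Windows 下载:https://github.com/oschwartz10612/poppler-windows/releases (解压后将 bin 目录加入 PATH)
# macOS:brew install poppler;Debian/Ubuntu:apt-get install poppler-utils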
完整代码实现
import os
from docx2pdf import convert
from pdf2image import convert_from_path
from PIL import Image
import tempfile
def word_to_images_via_pdf(docx_path, output_folder=None, dpi=200):
    """
    Convert a Word document to one JPEG per page, using PDF as the intermediate format.

    The document is converted to a temporary PDF with ``docx2pdf.convert``,
    the PDF is rasterized with ``pdf2image.convert_from_path``, and each page
    is written to ``<output_folder>/<document name>_page_<n>.jpg`` with pages
    numbered from 1.

    Args:
        docx_path: Path to the Word document.
        output_folder: Directory for the generated images. Defaults to the
            directory that contains the document.
        dpi: Rendering resolution passed to pdf2image.

    Returns:
        List of the saved image paths, in page order.

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.
    """
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"文件不存在: {docx_path}")

    if output_folder is None:
        # dirname() is "" for a bare file name such as "document.docx", and
        # os.makedirs("") raises, so fall back to the current directory.
        output_folder = os.path.dirname(docx_path) or "."
    os.makedirs(output_folder, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(docx_path))[0]
    image_paths = []

    with tempfile.TemporaryDirectory() as temp_dir:
        # 1. Word -> PDF
        temp_pdf = os.path.join(temp_dir, "temp.pdf")
        convert(docx_path, temp_pdf)

        # 2. PDF -> page images. pdf2image is told to use temp_dir as its
        #    working folder, so every page must be saved and closed before
        #    the temporary directory is removed at the end of this block.
        images = convert_from_path(
            temp_pdf,
            dpi=dpi,
            output_folder=temp_dir,
            fmt='jpeg',
        )

        # 3. Save each page under its final name.
        for page_number, image in enumerate(images, start=1):
            output_path = os.path.join(
                output_folder, f"{base_name}_page_{page_number}.jpg"
            )
            try:
                image.save(output_path, 'JPEG', quality=95)
            finally:
                # Release the handle on the temp file so cleanup cannot fail
                # on platforms that refuse to delete open files.
                image.close()
            image_paths.append(output_path)
            print(f"已保存: {output_path}")

    return image_paths
# 使用示例
if __name__ == "__main__":
    # Example: convert one document; its pages are written to "output_images".
    image_paths = word_to_images_via_pdf(
        docx_path="document.docx",
        output_folder="output_images",
    )
# 转换多个文件
def batch_convert(folder_path, output_folder="converted_images"):
    """
    Convert every Word document found directly inside ``folder_path``.

    Files are matched on a case-insensitive ``.docx`` / ``.doc`` suffix and
    processed in sorted order. Names starting with ``~$`` (the lock files
    Word leaves next to an open document) are skipped. A failure on one file
    is reported and does not stop the remaining files.

    Args:
        folder_path: Directory to scan (not recursive).
        output_folder: Directory that receives the images of all documents.

    Returns:
        Dict mapping each successfully converted document path to the list
        of image paths produced for it. Failed documents are omitted.
    """
    results = {}
    for file_name in sorted(os.listdir(folder_path)):
        if file_name.startswith('~$'):
            continue
        if not file_name.lower().endswith(('.docx', '.doc')):
            continue

        docx_path = os.path.join(folder_path, file_name)
        try:
            results[docx_path] = word_to_images_via_pdf(docx_path, output_folder)
        except Exception as e:
            # Batch boundary: report and continue with the next file.
            print(f"转换失败 {file_name}: {e}")
    return results
方法二:使用 win32com(仅限Windows)
安装库
pip install pywin32 python-docx pillow
代码实现
import os
from docx import Document
from PIL import Image, ImageDraw
import win32com.client
import tempfile
def word_to_images_win32(docx_path, output_folder, dpi=150):
    """
    Convert a Word document to one PNG per page by automating Microsoft Word.

    ``ExportAsFixedFormat`` with ``ExportFormat=17`` produces a PDF, not an
    image, so the document is exported once to a temporary PDF and that PDF
    is rasterized with pdf2image at ``dpi``. Each page is saved as
    ``<output_folder>/<document name>_page_<n>.png`` with pages numbered
    from 1.

    Requires Microsoft Word, pywin32, pdf2image and poppler.

    Args:
        docx_path: Path to the Word document.
        output_folder: Directory for the generated images (created if needed).
        dpi: Rendering resolution used when rasterizing the exported PDF.

    Returns:
        List of the saved image paths, in page order.

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.
    """
    import tempfile

    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"文件不存在: {docx_path}")
    os.makedirs(output_folder, exist_ok=True)

    # Imported locally: this snippet's own import list does not include
    # pdf2image. Done before Word is launched so a missing dependency fails
    # without leaving a Word process behind.
    from pdf2image import convert_from_path

    base_name = os.path.splitext(os.path.basename(docx_path))[0]
    image_paths = []

    word = win32com.client.Dispatch("Word.Application")
    word.Visible = False
    try:
        # Word resolves relative paths against its own working directory,
        # so always hand it an absolute path.
        doc = word.Documents.Open(os.path.abspath(docx_path))
        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_pdf = os.path.join(temp_dir, "export.pdf")
                doc.ExportAsFixedFormat(
                    OutputFileName=temp_pdf,
                    ExportFormat=17,  # wdExportFormatPDF
                    OpenAfterExport=False,
                    OptimizeFor=0,  # wdExportOptimizeForPrint
                    Range=0,  # wdExportAllDocument
                    Item=7,  # wdExportDocumentWithMarkup
                    IncludeDocProps=True,
                    KeepIRM=True,
                    CreateBookmarks=0,
                    DocStructureTags=True,
                    BitmapMissingFonts=True,
                    UseISO19005_1=False,
                )

                pages = convert_from_path(temp_pdf, dpi=dpi)
                for page_number, page in enumerate(pages, start=1):
                    final_path = os.path.join(
                        output_folder, f"{base_name}_page_{page_number}.png"
                    )
                    page.save(final_path, 'PNG')
                    image_paths.append(final_path)
                    print(f"已保存: {final_path}")
        finally:
            # Close without saving, even when the export or rendering failed.
            doc.Close(False)
    finally:
        word.Quit()

    return image_paths
方法三:使用 python-docx + 手动渲染(简单文档)
from docx import Document
from PIL import Image, ImageDraw, ImageFont
import os
def simple_word_to_images(docx_path, output_folder, page_size=(2480, 3508)):
    """
    Render the text of a Word document onto blank pages and save them as PNGs.

    Only paragraph text is drawn, one paragraph per line with no wrapping, so
    this suits simple text-only documents. Paragraphs whose style name starts
    with ``Heading`` use the larger title font. A new page starts after 50
    drawn lines. Pages are saved as
    ``<output_folder>/<document name>_page_<n>.png``.

    If the SimHei / SimSun fonts cannot be loaded, a warning is printed and
    Pillow's default font is used for all text.
    NOTE(review): the default font may not contain CJK glyphs — confirm the
    output is acceptable for Chinese documents.

    Args:
        docx_path: Path to the ``.docx`` document.
        output_folder: Directory for the generated images (created if needed).
        page_size: ``(width, height)`` of each page image in pixels.

    Returns:
        List of the saved image paths, in page order.
    """
    doc = Document(docx_path)
    os.makedirs(output_folder, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(docx_path))[0]
    image_paths = []

    lines_per_page = 50
    left_margin = 100
    top_margin = 100
    line_height = 40

    # Keep the try limited to font loading so that only a missing font
    # triggers the fallback; save errors must still propagate.
    try:
        title_font = ImageFont.truetype("simhei.ttf", 32)
        text_font = ImageFont.truetype("simsun.ttc", 24)
    except OSError:
        print("警告:未找到中文字体,使用默认字体")
        title_font = text_font = ImageFont.load_default()

    def save_page(page_image, page_number):
        """Write one finished page to disk and record its path."""
        output_path = os.path.join(
            output_folder, f"{base_name}_page_{page_number}.png"
        )
        page_image.save(output_path)
        image_paths.append(output_path)

    current_page = 1
    current_line = 0
    y_position = top_margin
    img = Image.new('RGB', page_size, color='white')
    draw = ImageDraw.Draw(img)

    for para in doc.paragraphs:
        if not para.text.strip():
            continue  # empty paragraphs take no space

        if current_line >= lines_per_page:
            # Page is full: flush it and start a fresh one.
            save_page(img, current_page)
            current_page += 1
            current_line = 0
            y_position = top_margin
            img = Image.new('RGB', page_size, color='white')
            draw = ImageDraw.Draw(img)

        style_name = getattr(para.style, 'name', None) or ''
        font = title_font if style_name.startswith('Heading') else text_font

        draw.text((left_margin, y_position), para.text, font=font, fill='black')
        y_position += line_height
        current_line += 1

    # Flush the last, partially filled page.
    if current_line > 0:
        save_page(img, current_page)

    return image_paths
高级功能:添加水印和优化
from PIL import Image, ImageDraw, ImageFont, ImageOps
import os
class WordToImageConverter:
    """
    Word-to-image converter that can watermark and downscale the page images.

    Recognized ``config`` keys:
        dpi: Rendering resolution for the base conversion (default 200).
        format: Pillow format name for the enhanced images (default 'JPEG').
        quality: Save quality for the enhanced images (default 95).
        watermark: Dict with ``text``, ``font_size``, ``opacity`` (0-255),
            ``angle`` (degrees) and optional ``color`` (RGB tuple). A falsy
            value disables watermarking.
        optimize: Whether to downscale the enhanced images (default True).
        max_size: ``(width, height)`` bound used when downscaling
            (default (1920, 1080)).
    """

    def __init__(self, config=None):
        """Store ``config``; a missing or empty config selects the defaults."""
        self.config = config or {
            'dpi': 200,
            'format': 'JPEG',
            'quality': 95,
            'watermark': {
                'text': 'CONFIDENTIAL',
                'font_size': 60,
                'opacity': 30,
                'angle': 45
            }
        }

    @staticmethod
    def _enhanced_path(image_path):
        """Return ``image_path`` with ``_enhanced`` inserted before the extension."""
        # splitext only touches the final extension; str.replace('.jpg', ...)
        # would also rewrite a '.jpg' that appears in a directory name.
        root, ext = os.path.splitext(image_path)
        return f"{root}_enhanced{ext}"

    def add_watermark(self, image, text):
        """
        Tile ``text`` diagonally across ``image`` as a translucent watermark.

        The image is modified in place and also returned. The text is drawn
        in mid-gray unless the watermark config provides a ``color``; white
        text would be invisible on a white document page.

        Args:
            image: Pillow image to stamp.
            text: Watermark text.

        Returns:
            The same ``image`` object, with the watermark applied.
        """
        settings = self.config.get('watermark') or {}
        font_size = settings.get('font_size', 60)
        opacity = settings.get('opacity', 30)
        angle = settings.get('angle', 45)
        red, green, blue = tuple(settings.get('color', (128, 128, 128)))[:3]

        try:
            font = ImageFont.truetype("arial.ttf", font_size)
        except OSError:
            font = ImageFont.load_default()

        # Draw on a square whose side is the image diagonal: after rotating
        # about the center, the tiled area still covers every image corner.
        side = int((image.width ** 2 + image.height ** 2) ** 0.5) + 1
        layer = Image.new('RGBA', (side, side), (255, 255, 255, 0))
        draw = ImageDraw.Draw(layer)

        bbox = draw.textbbox((0, 0), text, font=font)
        step_x = max(1, int(bbox[2] - bbox[0])) + 100
        step_y = max(1, int(bbox[3] - bbox[1])) + 100
        for x in range(0, side, step_x):
            for y in range(0, side, step_y):
                draw.text(
                    (x, y),
                    text,
                    font=font,
                    fill=(red, green, blue, opacity)
                )

        # Rotate in place (no expand), then cut out the image-sized center.
        layer = layer.rotate(angle)
        left = (side - image.width) // 2
        top = (side - image.height) // 2
        layer = layer.crop((left, top, left + image.width, top + image.height))

        # The layer doubles as its own alpha mask.
        image.paste(layer, (0, 0), layer)
        return image

    def optimize_image(self, image_path, max_size=(1920, 1080)):
        """
        Shrink the image file at ``image_path`` in place to fit ``max_size``.

        The aspect ratio is kept and images already within the bound are not
        enlarged. The file is rewritten with ``optimize=True, quality=85``.

        Returns:
            The resized Pillow image.
        """
        with Image.open(image_path) as source:
            source.thumbnail(max_size, Image.Resampling.LANCZOS)
            # Detach from the file so it is closed before being overwritten.
            img = source.copy()
        img.save(image_path, optimize=True, quality=85)
        return img

    def convert_with_enhancements(self, docx_path, output_folder):
        """
        Convert a document, then write a watermarked/downscaled copy of each page.

        The base images produced by ``word_to_images_via_pdf`` are left
        untouched; each enhanced copy is saved next to its base image as
        ``<name>_enhanced<ext>``. The watermark is applied before downscaling
        and both are done in memory, so the saved file carries both.

        Args:
            docx_path: Path to the Word document.
            output_folder: Directory for the base and enhanced images.

        Returns:
            List of the enhanced image paths, in page order.
        """
        image_paths = word_to_images_via_pdf(
            docx_path, output_folder, dpi=self.config.get('dpi', 200)
        )

        watermark = self.config.get('watermark') or {}
        watermark_text = watermark.get('text')
        should_optimize = self.config.get('optimize', True)
        max_size = self.config.get('max_size', (1920, 1080))

        enhanced_paths = []
        for img_path in image_paths:
            with Image.open(img_path) as source:
                img = source.convert('RGB')

            if watermark_text:
                img = self.add_watermark(img, watermark_text)
            if should_optimize:
                img.thumbnail(max_size, Image.Resampling.LANCZOS)

            enhanced_path = self._enhanced_path(img_path)
            img.save(
                enhanced_path,
                self.config.get('format', 'JPEG'),
                quality=self.config.get('quality', 95),
            )
            enhanced_paths.append(enhanced_path)
        return enhanced_paths
# 使用高级转换器
# Example: convert one document with the default enhancement settings.
converter = WordToImageConverter()
converter.convert_with_enhancements(
    docx_path="document.docx",
    output_folder="output",
)
批处理和监控脚本
import os
import time
import json
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler
class WordFileHandler(FileSystemEventHandler):
    """
    Watchdog handler that converts every Word document created in a folder.

    Each document gets its own output folder ``<output_base>/<document name>``.
    One JSON object per line is appended to ``LOG_FILE`` for every attempt,
    whether it succeeded or failed.
    """

    # Path of the JSON-lines log, relative to the current working directory.
    LOG_FILE = 'conversion_log.json'

    def __init__(self, converter, output_base="converted"):
        """
        Args:
            converter: Object exposing
                ``convert_with_enhancements(docx_path, output_folder)``.
            output_base: Root directory for converted output (created if needed).
        """
        super().__init__()
        self.converter = converter
        self.output_base = output_base
        os.makedirs(output_base, exist_ok=True)

    def _write_log(self, entry):
        """Append ``entry`` to the log; a log failure never aborts a conversion."""
        try:
            with open(self.LOG_FILE, 'a', encoding='utf-8') as f:
                f.write(json.dumps(entry, ensure_ascii=False))
                f.write('\n')
        except OSError as e:
            print(f"✗ 写入日志失败: {e}")

    def on_created(self, event):
        """Convert the newly created file if it is a Word document."""
        if event.is_directory:
            return

        file_path = event.src_path
        base = os.path.basename(file_path)
        # "~$name.docx" are the lock files Word creates beside an open
        # document; they are not real documents.
        if base.startswith('~$'):
            return
        if not file_path.lower().endswith(('.docx', '.doc')):
            return

        print(f"检测到新文件: {file_path}")
        file_name = os.path.splitext(base)[0]
        output_folder = os.path.join(self.output_base, file_name)

        log_entry = {
            'file': file_path,
            'time': time.strftime('%Y-%m-%d %H:%M:%S'),
            'status': 'success',
            'output': output_folder
        }
        try:
            self.converter.convert_with_enhancements(
                file_path,
                output_folder
            )
        except Exception as e:
            # Handler boundary: an escaping exception would surface in the
            # observer's thread, so report and record it here instead.
            log_entry['status'] = 'failed'
            log_entry['error'] = str(e)
            self._write_log(log_entry)
            print(f"✗ 转换失败: {e}")
            return

        self._write_log(log_entry)
        print(f"✓ 转换完成: {file_path}")
def start_folder_monitor(watch_folder, converter):
    """
    Watch ``watch_folder`` and convert Word documents as they appear.

    Blocks until interrupted with Ctrl+C. The observer is stopped and joined
    on every exit path, not only on ``KeyboardInterrupt``.

    Args:
        watch_folder: Directory to watch (not recursive); created if missing.
        converter: Converter passed on to ``WordFileHandler``.
    """
    # Create the folder up front so scheduling does not depend on it
    # already existing.
    os.makedirs(watch_folder, exist_ok=True)

    event_handler = WordFileHandler(converter)
    observer = Observer()
    observer.schedule(event_handler, watch_folder, recursive=False)
    observer.start()
    print(f"开始监控文件夹: {watch_folder}")
    print("按 Ctrl+C 停止监控")

    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass
    finally:
        observer.stop()
        observer.join()
# 启动监控
# Example: watch ./watch_folder using a converter with default settings.
converter = WordToImageConverter()
start_folder_monitor(
    watch_folder="./watch_folder",
    converter=converter,
)
Docker容器化部署
# Dockerfile
# Image for running the converter in folder-watch mode.
# NOTE(review): the Python code in this guide converts via docx2pdf, which
# drives Microsoft Word; Word is not installed in this Linux image — confirm
# which conversion backend the container is expected to use.
FROM python:3.9-slim
# System dependencies: poppler-utils for pdf2image, plus Latin and CJK fonts.
RUN apt-get update && apt-get install -y \
poppler-utils \
fonts-liberation \
fonts-wqy-zenhei \
fonts-wqy-microhei \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /app
# Install Python dependencies before copying the code.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY word_converter.py .
# Create the input (watched) and output folders.
RUN mkdir -p /app/input /app/output
# NOTE(review): no code shown in this guide parses a --watch argument;
# word_converter.py must implement it for this command to work.
CMD ["python", "word_converter.py", "--watch", "/app/input"]
性能优化建议
1. **并行处理**:
from concurrent.futures import ThreadPoolExecutor
import concurrent.futures
def batch_convert_parallel(docx_files, output_base, max_workers=4):
    """
    Convert several Word documents concurrently with a thread pool.

    Each document is converted by ``word_to_images_via_pdf`` into its own
    folder ``<output_base>/<document name>``. A failure on one document is
    reported together with its path and does not affect the others.

    Args:
        docx_files: Iterable of document paths.
        output_base: Root directory for the per-document output folders.
        max_workers: Maximum number of worker threads.

    Returns:
        Dict mapping each successfully converted document path to its list
        of image paths. Failed documents are omitted.
    """
    results = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which document each future belongs to so that results
        # and errors can be attributed once they complete out of order.
        future_to_file = {}
        for docx_file in docx_files:
            document_name = os.path.splitext(os.path.basename(docx_file))[0]
            output_folder = os.path.join(output_base, document_name)
            future = executor.submit(
                word_to_images_via_pdf, docx_file, output_folder
            )
            future_to_file[future] = docx_file

        for future in concurrent.futures.as_completed(future_to_file):
            docx_file = future_to_file[future]
            try:
                result = future.result()
            except Exception as e:
                print(f"错误 {docx_file}: {e}")
            else:
                results[docx_file] = result
                print(f"完成: {len(result)} 张图片")
    return results
2. **缓存已处理文件**:
import hashlib
import pickle
class ConversionCache:
    """
    Remember which files were already seen, keyed by path.

    Each entry stores the file's MD5 digest and modification time. The cache
    lives in memory; call ``save_cache()`` to persist it to ``cache_file``.
    """

    def __init__(self, cache_file="conversion_cache.pkl"):
        """Load the cache from ``cache_file``, starting empty if it is unusable."""
        self.cache_file = cache_file
        self.cache = self.load_cache()

    def get_file_hash(self, file_path):
        """Return the hex MD5 digest of the file's contents."""
        hasher = hashlib.md5()
        with open(file_path, 'rb') as f:
            # Read in 1 MiB chunks so large documents are not held in memory.
            for chunk in iter(lambda: f.read(1024 * 1024), b''):
                hasher.update(chunk)
        return hasher.hexdigest()

    def load_cache(self):
        """
        Read the cache dict from ``cache_file``.

        Returns:
            The stored dict, or an empty dict when the file is missing,
            unreadable, or does not contain a dict.
        """
        # NOTE: unpickling can execute arbitrary code. Only point cache_file
        # at files written by save_cache(), never at untrusted input.
        try:
            with open(self.cache_file, 'rb') as f:
                cached = pickle.load(f)
        except FileNotFoundError:
            return {}
        except Exception as e:
            # A damaged cache only costs re-conversion, so start over
            # rather than abort.
            print(f"警告:缓存文件无法读取,已忽略: {e}")
            return {}
        return cached if isinstance(cached, dict) else {}

    def save_cache(self):
        """Write the in-memory cache to ``cache_file``."""
        with open(self.cache_file, 'wb') as f:
            pickle.dump(self.cache, f)

    def needs_conversion(self, file_path):
        """
        Report whether ``file_path`` is new or changed since it was last seen.

        When the file is new or changed, its current digest and modification
        time are recorded in memory immediately, so a second call for the
        same unchanged file returns False.

        Returns:
            False when both the cached digest and modification time match
            the file; True otherwise.
        """
        file_hash = self.get_file_hash(file_path)
        last_modified = os.path.getmtime(file_path)

        cached_info = self.cache.get(file_path)
        if (cached_info is not None
                and cached_info['hash'] == file_hash
                and cached_info['modified'] == last_modified):
            return False

        self.cache[file_path] = {
            'hash': file_hash,
            'modified': last_modified
        }
        return True
总结
推荐方案
跨平台方案:方法一(docx2pdf + pdf2image;注意 docx2pdf 依赖本机安装的 Microsoft Word,仅支持 Windows 和 macOS)
Windows专用:方法二(win32com)
简单需求:方法三(python-docx渲染)
注意事项
确保有足够的磁盘空间存储临时文件
高DPI设置会产生大文件,适当调整
处理中文文档时确保系统有中文字体
考虑使用异常处理确保程序健壮性
这个指南提供了从基础到高级的完整解决方案,您可以根据具体需求选择合适的方案。