梧州市葬花

Python自动化实现Word到图片的转换指南

2026-03-28 11:40:02 浏览次数:1
详细信息

方法一:使用 docx2pdf + pdf2image(推荐)

安装所需库

pip install python-docx docx2pdf pdf2image pillow
# 还需要安装poppler(Windows用户)
# Windows下载:https://github.com/oschwartz10612/poppler-windows/releases

完整代码实现

import os
from docx2pdf import convert
from pdf2image import convert_from_path
from PIL import Image
import tempfile

def word_to_images_via_pdf(docx_path, output_folder=None, dpi=200):
    """
    Convert a Word document to one JPEG per page, using PDF as the
    intermediate format.

    Args:
        docx_path: Path to the Word document.
        output_folder: Directory that receives the images. Defaults to the
            directory containing ``docx_path`` (the current directory when
            ``docx_path`` is a bare filename).
        dpi: Resolution passed to ``convert_from_path`` when rasterising the
            PDF pages.

    Returns:
        List of the written image paths, in page order. Files are named
        ``<document name>_page_<n>.jpg`` with ``n`` starting at 1.

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.
    """
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"文件不存在: {docx_path}")

    if output_folder is None:
        # dirname() is "" for a bare filename such as "document.docx" and
        # os.makedirs("") raises, so fall back to the current directory.
        output_folder = os.path.dirname(docx_path) or os.curdir

    os.makedirs(output_folder, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(docx_path))[0]
    image_paths = []

    # The intermediate PDF and the raw page renders live in a throw-away
    # directory; only the final JPEGs are written to output_folder.
    with tempfile.TemporaryDirectory() as temp_dir:
        # 1. Word -> PDF
        temp_pdf = os.path.join(temp_dir, "temp.pdf")
        convert(docx_path, temp_pdf)

        # 2. PDF -> one image per page
        images = convert_from_path(
            temp_pdf,
            dpi=dpi,
            output_folder=temp_dir,
            fmt='jpeg'
        )

        # 3. Save the pages while temp_dir still exists
        for page_number, image in enumerate(images, start=1):
            output_path = os.path.join(
                output_folder, f"{base_name}_page_{page_number}.jpg"
            )
            image.save(output_path, 'JPEG', quality=95)
            image_paths.append(output_path)
            print(f"已保存: {output_path}")

    return image_paths

# Usage example
if __name__ == "__main__":
    # Convert a single file
    image_paths = word_to_images_via_pdf("document.docx", "output_images")

    # Convert every Word document found directly inside a folder
    def batch_convert(folder_path):
        """Convert each .docx/.doc file in ``folder_path`` into "converted_images".

        A failure on one file is printed and does not stop the remaining files.
        """
        for file in os.listdir(folder_path):
            # Compare the extension case-insensitively so that names such as
            # "REPORT.DOCX" are not skipped.
            if file.lower().endswith(('.docx', '.doc')):
                docx_path = os.path.join(folder_path, file)
                try:
                    word_to_images_via_pdf(docx_path, "converted_images")
                except Exception as e:
                    print(f"转换失败 {file}: {e}")

方法二:使用 win32com(仅限Windows)

安装库

pip install pywin32 python-docx pillow

代码实现

import os
from docx import Document
from PIL import Image, ImageDraw
import win32com.client
import tempfile

def word_to_images_win32(docx_path, output_folder, dpi=150):
    """
    Render each page of a Word document to a PNG by driving Microsoft Word
    through COM (Windows only, Word must be installed).

    ``ExportAsFixedFormat`` is called with ExportFormat=17 (PDF), so the
    document is first exported to a temporary PDF and the pages are then
    rasterised to PNG with pdf2image at the requested ``dpi``.

    Args:
        docx_path: Path to the Word document.
        output_folder: Directory that receives the PNG files (created if
            missing).
        dpi: Resolution used when rasterising the exported PDF.

    Returns:
        List of written image paths, named ``<document name>_page_<n>.png``
        with ``n`` starting at 1.

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.
    """
    if not os.path.exists(docx_path):
        raise FileNotFoundError(f"文件不存在: {docx_path}")

    # Local import: this snippet's import list does not include pdf2image.
    from pdf2image import convert_from_path

    os.makedirs(output_folder, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(docx_path))[0]
    image_paths = []

    # Start a hidden Word instance
    word = win32com.client.Dispatch("Word.Application")
    word.Visible = False
    doc = None

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_pdf = os.path.join(temp_dir, f"{base_name}.pdf")

            # COM needs an absolute path
            doc = word.Documents.Open(os.path.abspath(docx_path))

            # Export the whole document once; From/To are only honoured with
            # Range=3 (wdExportFromTo), so they are not passed here.
            doc.ExportAsFixedFormat(
                OutputFileName=temp_pdf,
                ExportFormat=17,  # wdExportFormatPDF
                OpenAfterExport=False,
                OptimizeFor=0,    # wdExportOptimizeForPrint
                Range=0,          # wdExportAllDocument
                Item=7,           # wdExportDocumentWithMarkup
                IncludeDocProps=True,
                KeepIRM=True,
                CreateBookmarks=0,
                DocStructureTags=True,
                BitmapMissingFonts=True,
                UseISO19005_1=False
            )

            # Rasterise every PDF page and store it as a real PNG file
            pages = convert_from_path(temp_pdf, dpi=dpi)
            for page_number, page in enumerate(pages, start=1):
                final_path = os.path.join(
                    output_folder, f"{base_name}_page_{page_number}.png"
                )
                page.save(final_path, 'PNG')
                image_paths.append(final_path)
                print(f"已保存: {final_path}")

    finally:
        # Always release the document and the Word process, even on failure
        if doc is not None:
            doc.Close(0)  # wdDoNotSaveChanges
        word.Quit()

    return image_paths

方法三:使用 python-docx + 手动渲染(简单文档)

from docx import Document
from PIL import Image, ImageDraw, ImageFont
import os

def simple_word_to_images(docx_path, output_folder, page_size=(2480, 3508)):
    """
    Draw the non-empty paragraphs of a Word document as plain text on PNG pages.

    Each paragraph is drawn as a single line (no wrapping); paragraphs whose
    style name starts with "Heading" use the larger title font. A new page is
    started after 50 drawn paragraphs. Intended for simple, text-only
    documents.

    Args:
        docx_path: Path to the .docx file.
        output_folder: Directory that receives the PNG pages (created if
            missing).
        page_size: ``(width, height)`` of each page image in pixels.

    Returns:
        List of written image paths, named ``<document name>_page_<n>.png``.
        Empty when the document has no non-empty paragraph.
    """
    doc = Document(docx_path)
    os.makedirs(output_folder, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(docx_path))[0]

    lines_per_page = 50   # paragraphs drawn per page
    left_margin = 100     # x coordinate of every line
    top_margin = 100      # y coordinate of the first line on a page
    line_spacing = 40     # vertical distance between lines

    # Only the font lookup is guarded: a missing font falls back to PIL's
    # default font instead of aborting the whole rendering.
    # NOTE(review): the default font may lack CJK glyphs - confirm the output
    # is acceptable for Chinese documents.
    try:
        title_font = ImageFont.truetype("simhei.ttf", 32)  # headings
        text_font = ImageFont.truetype("simsun.ttc", 24)   # body text
    except OSError:
        print("警告:未找到中文字体,使用默认字体")
        title_font = text_font = ImageFont.load_default()

    image_paths = []

    def save_page(page_image, page_number):
        """Write one finished page and record its path."""
        output_path = os.path.join(
            output_folder, f"{base_name}_page_{page_number}.png"
        )
        page_image.save(output_path)
        image_paths.append(output_path)

    current_page = 1
    current_line = 0
    y_position = top_margin
    img = Image.new('RGB', page_size, color='white')
    draw = ImageDraw.Draw(img)

    for para in doc.paragraphs:
        if not para.text.strip():
            continue  # skip empty paragraphs

        # Page is full: flush it and start a fresh canvas
        if current_line >= lines_per_page:
            save_page(img, current_page)
            current_page += 1
            current_line = 0
            y_position = top_margin
            img = Image.new('RGB', page_size, color='white')
            draw = ImageDraw.Draw(img)

        font = title_font if para.style.name.startswith('Heading') else text_font
        draw.text((left_margin, y_position), para.text, font=font, fill='black')

        y_position += line_spacing
        current_line += 1

    # Flush the last, partially filled page
    if current_line > 0:
        save_page(img, current_page)

    return image_paths

高级功能:添加水印和优化

from PIL import Image, ImageDraw, ImageFont, ImageOps
import os

class WordToImageConverter:
    """Convert a Word document to page images, then watermark and downscale them.

    Configuration keys read by this class: ``format`` and ``quality`` (used
    when saving enhanced images), ``optimize`` (optional, defaults to True)
    and ``watermark`` (a dict with ``text``, ``font_size``, ``opacity`` and
    ``angle``; a missing or falsy value disables watermarking).
    """

    # Bounding box used when downscaling enhanced images
    DEFAULT_MAX_SIZE = (1920, 1080)

    def __init__(self, config=None):
        """Store ``config``; a falsy value (None or {}) selects the defaults below."""
        self.config = config or {
            'dpi': 200,
            'format': 'JPEG',
            'quality': 95,
            'watermark': {
                'text': 'CONFIDENTIAL',
                'font_size': 60,
                'opacity': 30,
                'angle': 45
            }
        }

    @staticmethod
    def _enhanced_path(img_path):
        """Return ``img_path`` with ``_enhanced`` inserted before the extension."""
        # splitext only touches the final extension, so a directory name that
        # happens to contain ".jpg" is left alone.
        root, ext = os.path.splitext(img_path)
        return f"{root}_enhanced{ext}"

    def add_watermark(self, image, text):
        """Tile ``text`` over ``image`` as a rotated, semi-transparent overlay.

        The image is modified in place and also returned.
        """
        settings = self.config['watermark']

        # Transparent layer of the same size that receives the tiled text
        watermark = Image.new('RGBA', image.size, (255, 255, 255, 0))
        draw = ImageDraw.Draw(watermark)

        try:
            font = ImageFont.truetype("arial.ttf", settings['font_size'])
        except (OSError, ImportError):
            # Font file (or FreeType support) unavailable: use PIL's default font
            font = ImageFont.load_default()

        # Measure the text to derive the tiling step (100px gap between tiles)
        left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
        step_x = int(right - left) + 100
        step_y = int(bottom - top) + 100

        # NOTE(review): the fill colour is white, which is invisible on white
        # page backgrounds - confirm the intended watermark colour.
        fill = (255, 255, 255, settings['opacity'])
        for x in range(0, image.width, step_x):
            for y in range(0, image.height, step_y):
                draw.text((x, y), text, font=font, fill=fill)

        # Rotate the overlay; expand=1 grows the canvas to fit the rotation
        watermark = watermark.rotate(settings['angle'], expand=1)

        # Centre the (possibly larger) overlay on the image
        position = (
            (image.width - watermark.width) // 2,
            (image.height - watermark.height) // 2
        )

        # The overlay's alpha channel doubles as the paste mask
        image.paste(watermark, position, watermark)
        return image

    def optimize_image(self, image_path, max_size=(1920, 1080)):
        """Downscale the file at ``image_path`` in place to fit ``max_size``.

        The file is overwritten (``optimize=True, quality=85``) and the
        resized image object is returned.
        """
        img = Image.open(image_path)
        img.thumbnail(max_size, Image.Resampling.LANCZOS)
        img.save(image_path, optimize=True, quality=85)
        return img

    def convert_with_enhancements(self, docx_path, output_folder):
        """Convert ``docx_path`` to images and write an enhanced copy of each page.

        Each page produced by ``word_to_images_via_pdf`` is watermarked (when
        a watermark is configured) and downscaled (unless ``optimize`` is
        falsy) in memory, then saved next to the original as
        ``<name>_enhanced<ext>``. The original page images are left untouched.

        Returns:
            List of the enhanced image paths, in page order.
        """
        # Produce the plain page images first
        image_paths = word_to_images_via_pdf(docx_path, output_folder)

        watermark_cfg = self.config.get('watermark')
        should_optimize = self.config.get('optimize', True)

        enhanced_paths = []
        for img_path in image_paths:
            # The context manager releases the file handle of the source image
            with Image.open(img_path) as page:
                if watermark_cfg:
                    self.add_watermark(page, watermark_cfg['text'])

                # Downscale the already-watermarked image in memory; re-reading
                # the file here would discard the watermark.
                if should_optimize:
                    page.thumbnail(self.DEFAULT_MAX_SIZE, Image.Resampling.LANCZOS)

                enhanced_path = self._enhanced_path(img_path)
                page.save(enhanced_path, self.config['format'],
                          quality=self.config['quality'])
            enhanced_paths.append(enhanced_path)

        return enhanced_paths

# Example: run the enhanced converter (default configuration) on one document
converter = WordToImageConverter()
converter.convert_with_enhancements("document.docx", "output")

批处理和监控脚本

import os
import time
import json
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

class WordFileHandler(FileSystemEventHandler):
    """Watchdog handler that converts every newly created Word file.

    Each document is converted into its own sub-folder of ``output_base``
    (named after the document), and one JSON line is appended to
    ``conversion_log.json`` after a successful conversion.
    """

    # Extensions (lower-case) that trigger a conversion
    WORD_EXTENSIONS = ('.docx', '.doc')

    def __init__(self, converter, output_base="converted"):
        """Remember the converter and make sure ``output_base`` exists."""
        self.converter = converter
        self.output_base = output_base
        os.makedirs(output_base, exist_ok=True)

    def on_created(self, event):
        """Convert the created file if it is a Word document."""
        if event.is_directory:
            return

        file_path = event.src_path
        file_name = os.path.basename(file_path)

        # Word writes "~$<name>" owner/lock files next to an open document;
        # they carry a Word extension but are not convertible documents.
        if file_name.startswith('~$'):
            return
        if not file_name.lower().endswith(self.WORD_EXTENSIONS):
            return

        print(f"检测到新文件: {file_path}")
        try:
            # One output folder per document, named after the file stem
            stem = os.path.splitext(file_name)[0]
            output_folder = os.path.join(self.output_base, stem)

            self.converter.convert_with_enhancements(
                file_path,
                output_folder
            )

            # Append one JSON object per line to the conversion log
            log_entry = {
                'file': file_path,
                'time': time.strftime('%Y-%m-%d %H:%M:%S'),
                'status': 'success',
                'output': output_folder
            }

            with open('conversion_log.json', 'a', encoding='utf-8') as f:
                json.dump(log_entry, f)
                f.write('\n')

            print(f"✓ 转换完成: {file_path}")

        except Exception as e:
            # Keep the observer running: report the failure and move on
            print(f"✗ 转换失败: {e}")

def start_folder_monitor(watch_folder, converter):
    """Watch ``watch_folder`` (non-recursively) and convert new Word files.

    Blocks until Ctrl+C is pressed or the observer thread stops, then shuts
    the observer down and waits for it to finish.

    Args:
        watch_folder: Directory to monitor.
        converter: Object passed to ``WordFileHandler``; its
            ``convert_with_enhancements`` method is called for each new file.
    """
    event_handler = WordFileHandler(converter)
    observer = Observer()
    observer.schedule(event_handler, watch_folder, recursive=False)
    observer.start()

    try:
        print(f"开始监控文件夹: {watch_folder}")
        print("按 Ctrl+C 停止监控")

        # Stop waiting if the observer thread dies instead of looping forever
        while observer.is_alive():
            time.sleep(1)
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to end monitoring
        pass
    finally:
        # Shut the observer down on every exit path, not only on Ctrl+C
        observer.stop()
        observer.join()

# Start monitoring ./watch_folder with a default-configured converter
converter = WordToImageConverter()
start_folder_monitor("./watch_folder", converter)

Docker容器化部署

# Dockerfile
# Builds an image that runs word_converter.py in watch mode on /app/input.
# NOTE(review): docx2pdf is documented as driving Microsoft Word
# (Windows/macOS); confirm the Word-to-PDF step works in this Linux image.
FROM python:3.9-slim

# Install system dependencies: poppler (used by pdf2image) and fonts,
# including WenQuanYi fonts for Chinese text; the apt lists are removed
# afterwards to keep the layer small.
RUN apt-get update && apt-get install -y \
    poppler-utils \
    fonts-liberation \
    fonts-wqy-zenhei \
    fonts-wqy-microhei \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy the dependency list first and install it before copying the code
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY word_converter.py .

# Create the input (watched) and output folders
RUN mkdir -p /app/input /app/output

# Start the converter with the --watch argument pointing at /app/input
CMD ["python", "word_converter.py", "--watch", "/app/input"]

性能优化建议

并行处理

from concurrent.futures import ThreadPoolExecutor
import concurrent.futures

def batch_convert_parallel(docx_files, output_base, max_workers=4):
    """Convert several Word documents concurrently in a thread pool.

    Each document is converted by ``word_to_images_via_pdf`` into
    ``output_base/<document name without extension>``. The outcome of every
    conversion is printed; a failure of one document does not affect the
    others.

    Args:
        docx_files: Iterable of Word document paths.
        output_base: Parent directory for the per-document output folders.
        max_workers: Maximum number of worker threads.
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for docx_file in docx_files:
            doc_name = os.path.splitext(os.path.basename(docx_file))[0]
            output_folder = os.path.join(output_base, doc_name)
            future = executor.submit(word_to_images_via_pdf, docx_file, output_folder)
            futures.append(future)

    # Collect the outcome of every task
    for future in concurrent.futures.as_completed(futures):
        try:
            result = future.result()
            print(f"完成: {len(result)} 张图片")
        except Exception as e:
            print(f"错误: {e}")

缓存已处理文件
import hashlib
import pickle

class ConversionCache:
    """Track already-seen files by path, content hash and modification time.

    The cache is a dict ``{file_path: {'hash': ..., 'modified': ...}}`` that
    is loaded from ``cache_file`` on construction and written back only when
    ``save_cache()`` is called.
    """

    # Files are hashed in chunks so large documents are not read into memory at once
    _CHUNK_SIZE = 1024 * 1024

    def __init__(self, cache_file="conversion_cache.pkl"):
        """Load the existing cache from ``cache_file`` (empty if unavailable)."""
        self.cache_file = cache_file
        self.cache = self.load_cache()

    def get_file_hash(self, file_path):
        """Return the MD5 hex digest of the file's contents."""
        hasher = hashlib.md5()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(self._CHUNK_SIZE), b''):
                hasher.update(chunk)
        return hasher.hexdigest()

    def load_cache(self):
        """Return the cache stored in ``cache_file``, or ``{}`` if it cannot be used."""
        try:
            with open(self.cache_file, 'rb') as f:
                # SECURITY: pickle.load can execute arbitrary code - only
                # point cache_file at files written by this program.
                loaded = pickle.load(f)
        except Exception:
            # Best effort: a missing, truncated or corrupt cache file simply
            # means starting with an empty cache. Unlike a bare ``except``,
            # this lets KeyboardInterrupt/SystemExit propagate.
            return {}
        # Ignore a readable pickle that does not hold the expected mapping
        return loaded if isinstance(loaded, dict) else {}

    def save_cache(self):
        """Write the in-memory cache to ``cache_file``."""
        with open(self.cache_file, 'wb') as f:
            pickle.dump(self.cache, f)

    def needs_conversion(self, file_path):
        """Return False only if both hash and mtime match the cached entry.

        In every other case the entry for ``file_path`` is (re)recorded in the
        in-memory cache and True is returned.
        """
        file_hash = self.get_file_hash(file_path)
        last_modified = os.path.getmtime(file_path)

        cached_info = self.cache.get(file_path)
        if (cached_info is not None
                and cached_info['hash'] == file_hash
                and cached_info['modified'] == last_modified):
            return False

        self.cache[file_path] = {
            'hash': file_hash,
            'modified': last_modified
        }
        return True

总结

推荐方案

- 跨平台方案:方法一(docx2pdf + pdf2image)
- Windows专用:方法二(win32com)
- 简单需求:方法三(python-docx渲染)

注意事项

- 确保有足够的磁盘空间存储临时文件
- 高DPI设置会产生大文件,请适当调整
- 处理中文文档时确保系统有中文字体
- 考虑使用异常处理确保程序健壮性

这个指南提供了从基础到高级的完整解决方案,您可以根据具体需求选择合适的方案。

相关推荐