# feishu_fabu/modules/wp_parse_docx.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
WordPress 发布系统 - Word 文档解析模块
解析 .docx 文件，提取标题、正文、图片等元素
"""

import os
import re
import base64
import hashlib
from docx import Document
from docx.shared import Pt, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH

from modules.wp_logger import get_publish_logger, get_debug_logger

# Base directories.
# BASE_DIR is the parent of the directory that contains this module file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# TEMP_DIR is the "temp" folder directly under BASE_DIR.
TEMP_DIR = os.path.join(BASE_DIR, 'temp')
# Created at import time; exist_ok=True makes an existing directory a no-op.
os.makedirs(TEMP_DIR, exist_ok=True)


class WordParser:
    """Word 文档解析器"""

    def __init__(self, file_path):
        """
        Set up a parser for one Word document.

        Nothing is read from disk here; the document is only loaded when
        parse() is called.

        Args:
            file_path: path of the Word document to parse
        """
        # Where the document lives and its bare file name
        self.file_path = file_path
        self.filename = os.path.basename(file_path)

        # Results, all empty until parse() fills them in
        self.doc = None
        self.title = ""
        self.content_parts = []
        self.images = []
        self.metadata = dict.fromkeys(
            ('paragraph_count', 'image_count', 'word_count'), 0
        )

        # Publish-level and debug-level loggers used by every stage
        self.pl = get_publish_logger()
        self.dl = get_debug_logger()

    def parse(self):
        """
        Load the Word document and run every extraction stage.

        Returns:
            dict: the keys 'title', 'content', 'images' and 'metadata',
            taken from the parser attributes filled in by the stages.

        Raises:
            Exception: any error raised while loading or extracting is
            reported to both loggers and then re-raised unchanged.
        """
        self.pl.info(f"📖 开始解析文档：{self.filename}")

        try:
            # Load the document
            self.doc = Document(self.file_path)
            self.dl.log_step("加载文档", f"成功加载：{self.file_path}")

            # Stages run in this order: title, body, images, statistics
            stages = (
                self._extract_title,
                self._extract_content,
                self._extract_images,
                self._update_metadata,
            )
            for run_stage in stages:
                run_stage()

            self.pl.success(f"文档解析完成 - 标题：{self.title}，段落数：{self.metadata['paragraph_count']}，图片数：{self.metadata['image_count']}")

            parsed = {
                'title': self.title,
                'content': self.content_parts,
                'images': self.images,
                'metadata': self.metadata
            }
        except Exception as e:
            self.pl.error(f"文档解析失败：{str(e)}")
            self.dl.error(f"解析异常：{str(e)}", exc_info=True)
            raise

        return parsed

    def _extract_title(self):
        """
        Work out the document title and store it in ``self.title``.

        Candidates are tried in this order and the first one that yields
        non-blank text wins:

        1. the ``title`` core property of the document
        2. the first paragraph whose style name starts with "Heading"
        3. the first paragraph accepted by ``_is_title_style``
        4. the file name without its extension

        Blank candidates (a whitespace-only property, an empty heading
        paragraph) are skipped so that a later source can still supply a
        usable title instead of leaving ``self.title`` empty.
        """
        self.dl.log_step("提取标题")

        # Source 1: document core properties (may be None or only whitespace)
        property_title = (self.doc.core_properties.title or "").strip()
        if property_title:
            self.title = property_title
            self.dl.debug(f"从文档属性获取标题：{self.title}")
            return

        paragraphs = self.doc.paragraphs

        # Source 2: first non-blank paragraph that uses a Heading style
        for paragraph in paragraphs:
            heading_text = paragraph.text.strip()
            if heading_text and paragraph.style.name.startswith('Heading'):
                self.title = heading_text
                self.dl.debug(f"从标题样式获取标题：{self.title}")
                return

        # Source 3: first non-blank paragraph that looks like a title
        for paragraph in paragraphs:
            candidate = paragraph.text.strip()
            if candidate and self._is_title_style(paragraph):
                self.title = candidate
                self.dl.debug(f"从样式特征获取标题：{self.title}")
                return

        # Source 4: fall back to the file name without extension
        self.title = os.path.splitext(self.filename)[0]
        self.dl.warning(f"使用文件名作为标题：{self.title}")

    def _extract_content(self):
        """
        Convert the document paragraphs into a list of HTML fragments.

        The result is stored in ``self.content_parts``. Each non-blank
        paragraph becomes one of:

        * ``<h2>`` - style name starts with "Heading 1" or
          ``_is_heading(paragraph, 1)`` is true
        * ``<h3>`` - same test for level 2
        * ``<li>`` - ``_is_list_item`` is true; consecutive items are
          collected into one ``<ol>`` or ``<ul>`` element
        * ``<p>``  - everything else

        A pending list is closed by a blank paragraph, a heading, a normal
        paragraph, or a list item of the other list type, so a bullet list
        directly followed by a numbered list yields two separate elements.
        """
        self.dl.log_step("提取内容")

        content_html = []
        list_type = None
        list_items = []

        def flush_list():
            # Emit the pending list items (if any) as a single list element
            if list_items:
                content_html.extend(self._close_list(list_type, list_items))
                list_items.clear()

        for i, paragraph in enumerate(self.doc.paragraphs):
            text = paragraph.text.strip()

            # A blank paragraph produces no output but ends the current list
            if not text:
                flush_list()
                continue

            # Headings: level 1 is tested before level 2 because the level-2
            # size threshold also accepts level-1 sized text
            style_name = paragraph.style.name
            heading_tag = None
            for level, tag in ((1, 'h2'), (2, 'h3')):
                if style_name.startswith(f'Heading {level}') or self._is_heading(paragraph, level):
                    heading_tag = tag
                    break

            if heading_tag:
                flush_list()
                content_html.append(f'<{heading_tag}>{self._escape_html(text)}</{heading_tag}>')
                self.dl.debug(f"段落 {i}: {heading_tag.upper()} 标题")
                continue

            # List items
            if self._is_list_item(paragraph):
                item_type = 'ol' if self._is_numbered_list(paragraph) else 'ul'
                if item_type != list_type:
                    # Switching between bullets and numbers starts a new list
                    flush_list()
                    list_type = item_type

                # Drop the textual bullet / number before rendering the item
                clean_text = self._clean_list_marker(text)
                list_items.append(f'<li>{self._format_run_styles(clean_text, paragraph)}</li>')
                self.dl.debug(f"段落 {i}: 列表项")
                continue

            # Ordinary paragraph, rendered with bold / italic run styling
            flush_list()
            formatted_text = self._format_run_styles(text, paragraph)
            content_html.append(f'<p>{formatted_text}</p>')
            self.dl.debug(f"段落 {i}: 普通段落")

        # Close a list that runs to the end of the document
        flush_list()

        self.content_parts = content_html

    def _extract_images(self):
        """
        Collect the images of the document into ``self.images``.

        Two sources are scanned:

        1. paragraph runs whose XML mentions ``pic:pic`` or ``w:binData``
           (handed to ``_extract_image_from_run``); these entries carry the
           index of the paragraph they were found in
        2. document relationships whose target reference contains "image"
           (handed to ``_extract_image_from_rel``); these entries use
           ``paragraph_index`` -1

        ``self.images`` is rebuilt from scratch on every call so that
        running the extraction again does not duplicate entries. The
        1-based ``index`` only advances for images that were actually
        extracted, so indexes are contiguous for both sources.
        """
        self.dl.log_step("提取图片")

        self.images = []

        def add_image(image_data, paragraph_index):
            # Append one image record; the index counts successful extractions
            image_index = len(self.images) + 1
            self.images.append({
                'index': image_index,
                'paragraph_index': paragraph_index,
                'filename': f"image_{image_index}_{image_data['hash'][:8]}.{image_data['format']}",
                'data': image_data['data'],
                'format': image_data['format']
            })
            return self.images[-1]

        # Source 1: images referenced from paragraph runs
        for i, paragraph in enumerate(self.doc.paragraphs):
            for run in paragraph.runs:
                # Serialise the run XML once; both markers are checked on it
                run_xml = run._element.xml
                if 'pic:pic' not in run_xml and 'w:binData' not in run_xml:
                    continue
                image_data = self._extract_image_from_run(run)
                if image_data:
                    entry = add_image(image_data, i)
                    self.dl.debug(f"提取图片 {entry['index']}：{entry['filename']}")

        # Source 2: images reachable through the document relationships
        for rel in self.doc.part.rels.values():
            if "image" not in rel.target_ref:
                continue
            # An external (linked) image has no embedded part to read
            if getattr(rel, 'is_external', False):
                self.dl.debug(f"跳过外部链接图片：{rel.target_ref}")
                continue
            image_data = self._extract_image_from_rel(rel)
            if image_data:
                entry = add_image(image_data, -1)
                self.dl.debug(f"提取图片 {entry['index']}（从关系）：{entry['filename']}")

    def _update_metadata(self):
        """Refresh the paragraph, image and character counters in ``self.metadata``."""
        stats = self.metadata
        stats['paragraph_count'] = len(self.doc.paragraphs)
        stats['image_count'] = len(self.images)

        # Rough size measure: total number of characters over all paragraphs
        total_chars = 0
        for paragraph in self.doc.paragraphs:
            total_chars += len(paragraph.text)
        stats['word_count'] = total_chars

    # ========== 辅助方法 ==========

    def _is_title_style(self, paragraph):
        """
        Tell whether a paragraph looks like a title.

        Only the first run is inspected: the paragraph qualifies when that
        run has an explicit font size of at least 16pt, or when it is bold.
        A paragraph without runs never qualifies.
        """
        runs = paragraph.runs
        if not runs:
            return False

        font = runs[0].font
        points = font.size.pt if font.size else None
        is_large = bool(points) and points >= 16

        # Bold is only consulted when the size test did not already succeed
        return bool(is_large or font.bold)

    def _is_heading(self, paragraph, level):
        """
        Tell whether a paragraph is formatted like a heading of ``level``.

        Only the first run is inspected. It must be bold and carry an
        explicit font size of at least 18pt for level 1 or 14pt for
        level 2.

        Args:
            paragraph: paragraph to inspect
            level: heading level, 1 or 2

        Returns:
            bool: True when the formatting matches. Always False for a
            paragraph without runs and for any other level.
        """
        min_points = {1: 18, 2: 14}.get(level)
        if min_points is None or not paragraph.runs:
            return False

        font = paragraph.runs[0].font
        # bool() so callers always receive True/False rather than whatever
        # the last operand of the chain happened to be
        return bool(font.size and font.size.pt >= min_points and font.bold)

    def _is_list_item(self, paragraph):
        """
        Tell whether a paragraph should be rendered as a list item.

        A non-blank paragraph qualifies when its text starts with a bullet
        character, starts with a number / letter marker, or when its style
        name contains "List".
        """
        stripped = paragraph.text.strip()
        if not stripped:
            return False

        # Textual bullet at the very start
        if stripped.startswith(('•', '-', '–', '—', '▪', '▸', '▹')):
            return True

        # Textual numbering: "1." / "a)" / "(1)" and their full-width forms
        marker_patterns = (
            r'^\d+[\.\、\)]',
            r'^[a-zA-Z][\.\、\)]',
            r'^[（\(]\d+[）\)]',
        )
        for pattern in marker_patterns:
            if re.match(pattern, stripped):
                return True

        # No textual marker: fall back to the paragraph style name
        return 'List' in paragraph.style.name

    def _is_numbered_list(self, paragraph):
        """
        Tell whether a list paragraph belongs to a numbered list.

        The paragraph counts as numbered when its text starts with one of
        the numbering markers recognised elsewhere in this class
        ("1." / "a)" / "(1)" and their full-width forms). A paragraph
        without any textual marker also counts when its style name
        contains "List Number". A leading bullet character always means
        "not numbered".

        Returns:
            bool: True for a numbered item, False otherwise
        """
        text = paragraph.text.strip()

        # An explicit bullet wins over whatever the paragraph style says
        if text.startswith(('•', '-', '–', '—', '▪', '▸', '▹')):
            return False

        numbered_patterns = (
            r'^\d+[\.\、\)]',
            r'^[a-zA-Z][\.\、\)]',
            r'^[（\(]\d+[）\)]',
        )
        if any(re.match(pattern, text) for pattern in numbered_patterns):
            return True

        # Auto-numbered paragraphs carry no marker in their text; the
        # style name is the only hint left. Read defensively because the
        # style is not needed for the text-based checks above.
        style_name = getattr(getattr(paragraph, 'style', None), 'name', None) or ''
        return 'List Number' in style_name

    def _clean_list_marker(self, text):
        """
        Remove a leading bullet and/or numbering marker from list item text.

        Args:
            text: text of the list item

        Returns:
            str: the text without its leading marker and without
            surrounding whitespace. An empty input yields an empty string.
        """
        # Nothing to clean; also avoids inspecting the first character of
        # an empty string
        if not text:
            return ''

        # Remove one leading bullet character
        if text.startswith(('•', '-', '–', '—', '▪', '▸', '▹')):
            text = text[1:].strip()

        # Remove numbering markers: "1." / "a)" / "(1)" and full-width forms
        text = re.sub(r'^\d+[\.\、\)]\s*', '', text)
        text = re.sub(r'^[a-zA-Z][\.\、\)]\s*', '', text)
        text = re.sub(r'^[（\(]\d+[）\)]\s*', '', text)

        return text.strip()

    def _format_run_styles(self, text, paragraph):
        """
        Render ``text`` as HTML using the bold / italic styling of the runs.

        ``text`` is expected to be the paragraph text with leading content
        (for example a list marker) and surrounding whitespace removed.
        When it is a trailing part of the concatenated run text, only that
        part of the runs is rendered, so a marker removed by the caller
        stays removed. Otherwise every run is rendered in full.

        Args:
            text: the text that should appear in the output
            paragraph: paragraph whose runs supply the styling

        Returns:
            str: HTML-escaped text; bold runs are wrapped in ``<strong>``
            and italic runs in ``<em>``. Without runs, or when no run
            contributes any text, the escaped ``text`` is returned.
        """
        all_runs = paragraph.runs
        if not all_runs:
            return self._escape_html(text)

        text_runs = [run for run in all_runs if run.text]
        full_text = ''.join(run.text for run in text_runs)

        # Find the [start, end) window of the run text that equals ``text``.
        # Trailing whitespace is ignored because the caller strips it.
        visible_end = len(full_text.rstrip())
        if full_text[:visible_end].endswith(text):
            start, end = visible_end - len(text), visible_end
        else:
            # ``text`` does not line up with the runs: render them in full
            start, end = 0, len(full_text)

        result = []
        offset = 0  # position of the current run inside full_text
        for run in text_runs:
            raw = run.text
            piece = raw[max(start - offset, 0):max(end - offset, 0)]
            offset += len(raw)
            if not piece:
                continue

            # Bold
            if run.font.bold:
                piece_html = f'<strong>{self._escape_html(piece)}</strong>'
            else:
                piece_html = self._escape_html(piece)

            # Italic
            if run.font.italic:
                piece_html = f'<em>{piece_html}</em>'

            result.append(piece_html)

        return ''.join(result) if result else self._escape_html(text)

    def _close_list(self, list_type, items):
        """
        Wrap collected ``<li>`` fragments in one list element.

        Returns a one-element list holding the complete ``<list_type>``
        element, or an empty list when there are no items.
        """
        if items:
            return [f'<{list_type}>{"".join(items)}</{list_type}>']
        return []

    def _escape_html(self, text):
        """Escape the HTML special characters ``&``, ``<``, ``>`` and ``"`` in text."""
        # One pass over the text; every character is looked up in the
        # table once, so entities that were just inserted are never
        # escaped a second time
        entity_table = str.maketrans({
            '&': '&amp;',
            '<': '&lt;',
            '>': '&gt;',
            '"': '&quot;',
        })
        return text.translate(entity_table)

    def _extract_image_from_run(self, run):
        """
        Extract an image stored as base64 ``binData`` inside a run's XML.

        Only runs whose XML contains ``w:binData`` are handled; the first
        element whose tag contains "binData" and that has text is decoded.

        Args:
            run: run object exposing its XML text as ``run._element.xml``

        Returns:
            dict | None: ``{'data': bytes, 'format': str, 'hash': str}``
            where ``hash`` is the MD5 hex digest of the data (used as an
            identifier for naming). None when the run holds no such data
            or when extraction fails; failures are logged, not raised.
        """
        try:
            run_xml = run._element.xml
            if 'w:binData' not in run_xml:
                return None

            import xml.etree.ElementTree as ET

            # Parse the XML and look for the embedded picture data
            for elem in ET.fromstring(run_xml).iter():
                if 'binData' in elem.tag and elem.text:
                    image_data = base64.b64decode(elem.text)
                    image_hash = hashlib.md5(image_data).hexdigest()

                    # Detect the picture format from its leading bytes
                    fmt = self._detect_image_format(image_data)

                    return {
                        'data': image_data,
                        'format': fmt,
                        'hash': image_hash
                    }
        except Exception as e:
            # Best effort: one unreadable picture must not abort parsing
            self.dl.error(f"提取图片失败：{str(e)}")

        return None

    def _extract_image_from_rel(self, rel):
        """
        Read the image behind a document relationship.

        Returns a dict with the raw bytes ('data'), the file format
        derived from the part's content type ('format') and the MD5 hex
        digest of the bytes ('hash'). Returns None, after logging the
        error, when anything goes wrong.
        """
        try:
            part = rel.target_part
            blob = part.blob
            digest = hashlib.md5(blob).hexdigest()

            # Map the part's content type to a file format
            image_format = self._content_type_to_format(part.content_type)

            return {'data': blob, 'format': image_format, 'hash': digest}
        except Exception as e:
            self.dl.error(f"从关系提取图片失败：{str(e)}")
            return None

    def _detect_image_format(self, data):
        """
        Detect the image format from the leading bytes of ``data``.

        Recognised signatures: JPEG, PNG, GIF (87a / 89a), BMP and WebP
        (a RIFF container whose form type is "WEBP").

        Args:
            data: raw image bytes

        Returns:
            str: 'jpg', 'png', 'gif', 'bmp' or 'webp'; 'jpg' when no
            signature matches
        """
        if data.startswith(b'\xff\xd8\xff'):
            return 'jpg'
        if data.startswith(b'\x89PNG\r\n\x1a\n'):
            return 'png'
        if data.startswith((b'GIF87a', b'GIF89a')):
            return 'gif'
        if data.startswith(b'BM'):
            return 'bmp'
        # WebP: bytes 0-3 are "RIFF", bytes 4-7 a size field, bytes 8-11 "WEBP"
        if data.startswith(b'RIFF') and data[8:12] == b'WEBP':
            return 'webp'
        return 'jpg'  # default when the signature is unknown

    def _content_type_to_format(self, content_type):
        """
        Map an image content type to a file format / extension.

        The lookup ignores letter case, surrounding whitespace and any
        parameters after a ``;``.

        Args:
            content_type: media type string such as 'image/png'; may be
                None or empty

        Returns:
            str: the matching format, or 'jpg' when the content type is
            missing or not in the table
        """
        format_map = {
            'image/jpeg': 'jpg',
            'image/jpg': 'jpg',  # non-standard alias
            'image/png': 'png',
            'image/gif': 'gif',
            'image/bmp': 'bmp',
            'image/webp': 'webp',
            'image/tiff': 'tiff'
        }
        if not content_type:
            return 'jpg'

        # Keep only the bare media type, e.g. "Image/PNG; x=y" -> "image/png"
        media_type = str(content_type).split(';', 1)[0].strip().lower()
        return format_map.get(media_type, 'jpg')


def parse_word_file(file_path):
    """
    Parse a Word file in a single call.

    Builds a WordParser for ``file_path`` and returns whatever its
    parse() method returns.

    Args:
        file_path: path of the Word document

    Returns:
        dict: the parse result
    """
    return WordParser(file_path).parse()


if __name__ == '__main__':
    # Command-line entry point: parse the file given as the first argument
    # and print a short summary followed by the generated HTML.
    import sys

    if not sys.argv[1:]:
        print("用法：python wp_parse_docx.py <word 文件路径>")
        sys.exit(1)

    result = parse_word_file(sys.argv[1])
    for summary_line in (
        f"标题：{result['title']}",
        f"段落数：{result['metadata']['paragraph_count']}",
        f"图片数：{result['metadata']['image_count']}",
        f"字数：{result['metadata']['word_count']}",
        f"\nHTML 内容：\n{''.join(result['content'])}",
    ):
        print(summary_line)