#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
WordPress publishing system - Word document parsing module.

Parses a .docx file and extracts its title, body content and images.
"""

import os
import re
import base64
import hashlib
from docx import Document
from docx.shared import Pt, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from modules.wp_logger import get_publish_logger, get_debug_logger

# Base directory: the parent of the directory that contains this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
TEMP_DIR = os.path.join(BASE_DIR, 'temp')
os.makedirs(TEMP_DIR, exist_ok=True)

# Characters that mark a bullet item when they are the first character.
_BULLET_CHARS = ('•', '-', '–', '—', '▪', '▸', '▹')

# Textual numbering markers: "1." / "1、" / "1)", "a." / "a)", and "(1)".
_DIGIT_MARKER = re.compile(r'^\d+[\.\、\)]')
_LETTER_MARKER = re.compile(r'^[a-zA-Z][\.\、\)]')
_PAREN_DIGIT_MARKER = re.compile(r'^[(\(]\d+[)\)]')


class WordParser:
    """Word document parser."""

    # Minimum font size (pt) of the first run for a bold paragraph to be
    # treated as a heading of the given level.
    _HEADING_MIN_PT = {1: 18, 2: 14}

    def __init__(self, file_path):
        """
        Initialise the parser.

        Args:
            file_path: Path to the Word document.
        """
        self.file_path = file_path
        self.filename = os.path.basename(file_path)
        self.doc = None
        self.title = ""
        self.content_parts = []
        self.images = []
        self.metadata = {
            'paragraph_count': 0,
            'image_count': 0,
            'word_count': 0
        }
        self.pl = get_publish_logger()
        self.dl = get_debug_logger()

    def parse(self):
        """
        Parse the Word document.

        Returns:
            dict: A dictionary with the keys ``title``, ``content``
            (list of HTML fragments), ``images`` and ``metadata``.

        Raises:
            Exception: Any error raised while loading or parsing the
            document is logged and re-raised unchanged.
        """
        self.pl.info(f"📖 开始解析文档:{self.filename}")

        try:
            # Load the document.
            self.doc = Document(self.file_path)
            self.dl.log_step("加载文档", f"成功加载:{self.file_path}")

            self._extract_title()
            self._extract_content()
            self._extract_images()
            self._update_metadata()

            self.pl.success(f"文档解析完成 - 标题:{self.title},段落数:{self.metadata['paragraph_count']},图片数:{self.metadata['image_count']}")

            return {
                'title': self.title,
                'content': self.content_parts,
                'images': self.images,
                'metadata': self.metadata
            }

        except Exception as e:
            self.pl.error(f"文档解析失败:{str(e)}")
            self.dl.error(f"解析异常:{str(e)}", exc_info=True)
            raise

    def _extract_title(self):
        """Determine the document title, trying four sources in order."""
        self.dl.log_step("提取标题")

        # Method 1: the document's core properties. A whitespace-only
        # property is treated as missing so the later methods still run.
        core_title = (self.doc.core_properties.title or "").strip()
        if core_title:
            self.title = core_title
            self.dl.debug(f"从文档属性获取标题:{self.title}")
            return

        # Method 2: the first non-empty paragraph with a Heading style.
        for paragraph in self.doc.paragraphs:
            heading_text = paragraph.text.strip()
            if heading_text and paragraph.style.name.startswith('Heading'):
                self.title = heading_text
                self.dl.debug(f"从标题样式获取标题:{self.title}")
                return

        # Method 3: the first non-empty paragraph that looks like a title
        # (large or bold first run).
        for paragraph in self.doc.paragraphs:
            if paragraph.text.strip() and self._is_title_style(paragraph):
                self.title = paragraph.text.strip()
                self.dl.debug(f"从样式特征获取标题:{self.title}")
                return

        # Method 4: fall back to the file name without its extension.
        self.title = os.path.splitext(self.filename)[0]
        self.dl.warning(f"使用文件名作为标题:{self.title}")

    def _extract_content(self):
        """
        Convert the document's paragraphs into a list of HTML fragments.

        The result is stored in ``self.content_parts``. Consecutive list
        items are grouped into a single list element; an empty paragraph,
        a heading or a normal paragraph ends the current list.
        """
        # NOTE(review): the middle of this method was lost in the source
        # as received (only the empty-paragraph handling, the level-1
        # heading test, the final paragraph append and the trailing list
        # flush survived). The heading/list branches below are rebuilt
        # from the helpers this class defines; the exact tags and any
        # debug messages of the original branches should be confirmed.
        self.dl.log_step("提取内容")

        content_html = []
        list_type = None
        list_items = []

        for i, paragraph in enumerate(self.doc.paragraphs):
            text = paragraph.text.strip()

            # An empty paragraph only terminates a pending list.
            if not text:
                self._flush_list(content_html, list_type, list_items)
                continue

            style_name = paragraph.style.name

            # Level-1 heading, by style name or by font heuristics.
            if style_name.startswith('Heading 1') or self._is_heading(paragraph, 1):
                self._flush_list(content_html, list_type, list_items)
                content_html.append(f'<h1>{self._escape_html(text)}</h1>')
                continue

            # Level-2 heading.
            if style_name.startswith('Heading 2') or self._is_heading(paragraph, 2):
                self._flush_list(content_html, list_type, list_items)
                content_html.append(f'<h2>{self._escape_html(text)}</h2>')
                continue

            # List item: collect until the list ends or its type changes.
            if self._is_list_item(paragraph):
                item_type = 'ol' if self._is_numbered_list(paragraph) else 'ul'
                if item_type != list_type:
                    self._flush_list(content_html, list_type, list_items)
                list_type = item_type
                item_text = self._escape_html(self._clean_list_marker(text))
                list_items.append(f'<li>{item_text}</li>')
                continue

            # Normal paragraph.
            self._flush_list(content_html, list_type, list_items)
            formatted_text = self._format_run_styles(text, paragraph)
            content_html.append(f'<p>{formatted_text}</p>')
            self.dl.debug(f"段落 {i}: 普通段落")

        # Close a list that runs to the end of the document.
        self._flush_list(content_html, list_type, list_items)

        self.content_parts = content_html

    def _flush_list(self, content_html, list_type, list_items):
        """Append the pending list (if any) to *content_html* and clear it."""
        if list_items:
            content_html.extend(self._close_list(list_type, list_items))
            list_items.clear()

    def _extract_images(self):
        """
        Collect the images of the document into ``self.images``.

        Runs are scanned for inline image data first, then every document
        relationship whose target reference contains "image" is read.
        """
        self.dl.log_step("提取图片")

        # Start from a clean list so parsing twice does not duplicate images.
        self.images = []
        image_index = 0

        for i, paragraph in enumerate(self.doc.paragraphs):
            for run in paragraph.runs:
                # Serialise the run once; the property rebuilds the XML
                # string on every access.
                run_xml = run._element.xml
                if 'pic:pic' not in run_xml and 'w:binData' not in run_xml:
                    continue
                image_data = self._extract_image_from_run(run)
                if image_data:
                    image_index += 1
                    self.images.append({
                        'index': image_index,
                        'paragraph_index': i,
                        'filename': f"image_{image_index}_{image_data['hash'][:8]}.{image_data['format']}",
                        'data': image_data['data'],
                        'format': image_data['format']
                    })
                    self.dl.debug(f"提取图片 {image_index}:{self.images[-1]['filename']}")

        # Also look at the images referenced by the document relationships.
        for rel in self.doc.part.rels.values():
            if "image" not in rel.target_ref:
                continue
            image_data = self._extract_image_from_rel(rel)
            if image_data:
                # Count only successful extractions so indices stay
                # contiguous, as in the run loop above.
                image_index += 1
                self.images.append({
                    'index': image_index,
                    # -1: the position inside the text is not known here.
                    'paragraph_index': -1,
                    'filename': f"image_{image_index}_{image_data['hash'][:8]}.{image_data['format']}",
                    'data': image_data['data'],
                    'format': image_data['format']
                })
                self.dl.debug(f"提取图片 {image_index}(从关系):{self.images[-1]['filename']}")

    def _update_metadata(self):
        """Refresh paragraph, image and character counts in ``self.metadata``."""
        self.metadata['paragraph_count'] = len(self.doc.paragraphs)
        self.metadata['image_count'] = len(self.images)

        # Rough count: the number of characters over all paragraphs.
        word_count = sum(len(p.text) for p in self.doc.paragraphs)
        self.metadata['word_count'] = word_count

    # ========== Helper methods ==========

    def _is_title_style(self, paragraph):
        """Return True if the first run is at least 16 pt or is bold."""
        if not paragraph.runs:
            return False

        first_run = paragraph.runs[0]
        if first_run.font.size:
            size = first_run.font.size.pt
            if size and size >= 16:
                return True

        if first_run.font.bold:
            return True

        return False

    def _is_heading(self, paragraph, level):
        """
        Return True if the paragraph looks like a heading of *level*.

        Only levels 1 and 2 are recognised: the first run must be bold and
        at least 18 pt (level 1) or 14 pt (level 2). Any other level
        yields False.
        """
        min_pt = self._HEADING_MIN_PT.get(level)
        if min_pt is None or not paragraph.runs:
            return False
        font = paragraph.runs[0].font
        return bool(font.size and font.size.pt >= min_pt and font.bold)

    def _is_list_item(self, paragraph):
        """Return True if the paragraph is a bullet or numbered list item."""
        text = paragraph.text.strip()
        if not text:
            return False

        # Bullet character at the start of the text.
        if text[0] in _BULLET_CHARS:
            return True

        # Textual numbering.
        if _DIGIT_MARKER.match(text):
            return True
        if _LETTER_MARKER.match(text):
            return True
        if _PAREN_DIGIT_MARKER.match(text):
            return True

        # Paragraph style whose name contains "List".
        if 'List' in paragraph.style.name:
            return True

        return False

    def _is_numbered_list(self, paragraph):
        """
        Return True if the paragraph text starts with a numbering marker.

        The "(1)" form is included so that every numbering marker accepted
        by ``_is_list_item`` is classified as numbered here as well.
        """
        text = paragraph.text.strip()
        return bool(
            _DIGIT_MARKER.match(text)
            or _LETTER_MARKER.match(text)
            or _PAREN_DIGIT_MARKER.match(text)
        )

    def _clean_list_marker(self, text):
        """Strip a leading bullet character and/or numbering marker."""
        # Guard against empty input; indexing text[0] would raise otherwise.
        if not text:
            return text

        # Remove the bullet character.
        if text[0] in _BULLET_CHARS:
            text = text[1:].strip()

        # Remove the numbering marker.
        text = re.sub(r'^\d+[\.\、\)]\s*', '', text)
        text = re.sub(r'^[a-zA-Z][\.\、\)]\s*', '', text)
        text = re.sub(r'^[(\(]\d+[)\)]\s*', '', text)

        return text.strip()

    def _format_run_styles(self, text, paragraph):
        """
        Render the paragraph's runs as HTML with bold/italic markup.

        Args:
            text: Plain paragraph text, used when the paragraph has no
                runs or none of its runs carries text.
            paragraph: The paragraph whose runs are rendered.

        Returns:
            str: HTML-escaped text with bold and italic runs wrapped.
        """
        if not paragraph.runs:
            return self._escape_html(text)

        result = []
        for run in paragraph.runs:
            if not run.text:
                continue

            run_text = self._escape_html(run.text)

            # NOTE(review): the wrapping tags were stripped from the
            # source as received; <strong>/<em> are assumed - confirm.
            if run.font.bold:
                run_text = f'<strong>{run_text}</strong>'
            if run.font.italic:
                run_text = f'<em>{run_text}</em>'

            result.append(run_text)

        return ''.join(result) if result else self._escape_html(text)

    def _close_list(self, list_type, items):
        """
        Wrap *items* in a list element.

        Args:
            list_type: Tag name of the list element (e.g. 'ul' or 'ol').
            items: Already-rendered item fragments.

        Returns:
            list: A single-element list with the HTML of the whole list,
            or an empty list when *items* is empty.
        """
        if not items:
            return []

        return [f'<{list_type}>{"".join(items)}</{list_type}>']

    def _escape_html(self, text):
        """Escape the HTML special characters ``&``, ``<``, ``>`` and ``"``."""
        # '&' must be replaced first so the entities produced by the
        # later replacements are not escaped a second time.
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        text = text.replace('"', '&quot;')
        return text

    def _extract_image_from_run(self, run):
        """
        Extract base64-embedded image data (``binData``) from a run.

        Returns:
            dict | None: ``{'data', 'format', 'hash'}`` for the first
            ``binData`` element with content, or None when the run holds
            no such element or extraction fails (the failure is logged).
        """
        try:
            xml = run._element.xml

            if 'w:binData' in xml:
                import xml.etree.ElementTree as ET

                # Parse the run XML and decode the first binData payload.
                root = ET.fromstring(xml)
                for elem in root.iter():
                    if 'binData' in elem.tag and elem.text:
                        image_data = base64.b64decode(elem.text)
                        # MD5 serves only as a content fingerprint for
                        # the generated file name.
                        image_hash = hashlib.md5(image_data).hexdigest()

                        fmt = self._detect_image_format(image_data)

                        return {
                            'data': image_data,
                            'format': fmt,
                            'hash': image_hash
                        }
        except Exception as e:
            # Best effort: a broken image must not abort the whole parse.
            self.dl.error(f"提取图片失败:{str(e)}")

        return None

    def _extract_image_from_rel(self, rel):
        """
        Extract image data from a document relationship.

        Returns:
            dict | None: ``{'data', 'format', 'hash'}``, or None when the
            target part cannot be read (the failure is logged).
        """
        try:
            image_part = rel.target_part
            image_data = image_part.blob
            image_hash = hashlib.md5(image_data).hexdigest()

            # Derive the file format from the part's content type.
            content_type = image_part.content_type
            fmt = self._content_type_to_format(content_type)

            return {
                'data': image_data,
                'format': fmt,
                'hash': image_hash
            }
        except Exception as e:
            # Best effort: a broken relationship must not abort the parse.
            self.dl.error(f"从关系提取图片失败:{str(e)}")

        return None

    def _detect_image_format(self, data):
        """Guess the image format from the leading magic bytes of *data*."""
        if data[:3] == b'\xff\xd8\xff':
            return 'jpg'
        elif data[:8] == b'\x89PNG\r\n\x1a\n':
            return 'png'
        elif data[:6] in (b'GIF87a', b'GIF89a'):
            return 'gif'
        elif data[:2] == b'BM':
            return 'bmp'
        elif data[:4] == b'RIFF' and data[8:12] == b'WEBP':
            return 'webp'
        return 'jpg'  # Unknown signature: default to jpg.

    def _content_type_to_format(self, content_type):
        """Map a MIME content type to a file extension (default 'jpg')."""
        format_map = {
            'image/jpeg': 'jpg',
            'image/png': 'png',
            'image/gif': 'gif',
            'image/bmp': 'bmp',
            'image/webp': 'webp'
        }
        return format_map.get(content_type, 'jpg')


def parse_word_file(file_path):
    """
    Convenience function that parses a Word file.

    Args:
        file_path: Path to the Word document.

    Returns:
        dict: The parse result, as returned by ``WordParser.parse``.
    """
    parser = WordParser(file_path)
    return parser.parse()


if __name__ == '__main__':
    import sys

    # NOTE(review): the source as received was truncated in the middle of
    # the usage string below; the argument placeholder and everything
    # after it are a minimal reconstruction - confirm against the
    # original script.
    if len(sys.argv) < 2:
        print("用法:python wp_parse_docx.py <docx_path>")
        sys.exit(1)

    parse_word_file(sys.argv[1])