feishu_fabu/modules/wp_parse_docx.py
wp-publish-bot 1fb93e34c6 feat: 初始化 WordPress 自动发布系统(飞书机器人集成)
- 飞书消息接收与处理(文字、图片、Word 文档)
- WordPress REST API 文章发布
- 图片自动上传到媒体库
- Word 文档解析与发布
- HTML 格式化与分类自动匹配
- Python CLI 工具(避免 shell 引号冲突)
- Webhook 服务器(8080 端口)
- 完整日志系统
2026-05-12 15:09:30 +08:00

446 lines
15 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
WordPress 发布系统 - Word 文档解析模块
解析 .docx 文件,提取标题、正文、图片等元素
"""
import os
import re
import base64
import hashlib
from docx import Document
from docx.shared import Pt, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from modules.wp_logger import get_publish_logger, get_debug_logger
# Base directories.
# BASE_DIR is the parent of the directory that contains this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# TEMP_DIR is the "temp" folder directly under BASE_DIR.
TEMP_DIR = os.path.join(BASE_DIR, 'temp')
# Created at import time; exist_ok=True makes this a no-op when it already exists.
os.makedirs(TEMP_DIR, exist_ok=True)
class WordParser:
    """Parse a Word ``.docx`` file into a title, HTML fragments and images.

    Typical use is ``WordParser(path).parse()``. Loading relies on
    ``docx.Document``; progress is reported through the loggers returned by
    ``get_publish_logger()`` and ``get_debug_logger()``.
    """

    # Non-ASCII glyphs accepted as hand-typed bullet markers. A frozenset is
    # used on purpose: ``'' in 'abc'`` is True for strings, which would make
    # an empty paragraph look like a bullet.
    _BULLET_CHARS = frozenset('•●○■□▪◆·')
    # An ASCII hyphen is a bullet only when followed by whitespace (or alone),
    # so text such as "-5 degrees" is not mistaken for a list item.
    _HYPHEN_MARKER_RE = re.compile(r'^-(?:\s+|$)')
    # "1." "1)" "1)" "1、" -- a dot directly followed by a digit is a decimal
    # or section number ("3.14", "1.1 Intro"), not a list marker.
    _DIGIT_MARKER_RE = re.compile(r'^\d+(?:[))、]|\.(?!\d))\s*')
    # "a." "a)" "a)" "a、" -- a dot followed by another "x." is an
    # abbreviation ("e.g.", "U.S."), not a list marker.
    _ALPHA_MARKER_RE = re.compile(r'^[a-zA-Z](?:[))、]|\.(?![A-Za-z]\.))\s*')
    # "(1)" and the fullwidth form "(1)".
    _PAREN_MARKER_RE = re.compile(r'^[((]\d+[))]\s*')

    # Minimum first-run font size (pt) for a bold paragraph to count as a
    # heading of the given level when no Heading style is applied.
    _HEADING_MIN_PT = {1: 18, 2: 14}

    # Single-pass equivalent of replacing & < > " in that order.
    _HTML_ESCAPES = str.maketrans({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
    })

    _CONTENT_TYPE_FORMATS = {
        'image/jpeg': 'jpg',
        'image/jpg': 'jpg',
        'image/png': 'png',
        'image/gif': 'gif',
        'image/bmp': 'bmp',
        'image/webp': 'webp',
        'image/tiff': 'tiff',
        'image/x-emf': 'emf',
        'image/x-wmf': 'wmf',
    }

    def __init__(self, file_path):
        """Prepare a parser for one document; nothing is read until parse().

        Args:
            file_path: Path of the Word document.
        """
        self.file_path = file_path
        self.filename = os.path.basename(file_path)
        self.doc = None
        self.title = ""
        self.content_parts = []
        self.images = []
        self.metadata = {
            'paragraph_count': 0,
            'image_count': 0,
            'word_count': 0
        }
        self.pl = get_publish_logger()
        self.dl = get_debug_logger()

    def parse(self):
        """Load the document and extract title, content, images and metadata.

        Returns:
            dict with keys ``title`` (str), ``content`` (list of HTML
            fragments), ``images`` (list of image dicts) and ``metadata``.

        Raises:
            Exception: any error raised while loading or extracting is logged
                to both loggers and then re-raised unchanged.
        """
        self.pl.info(f"📖 开始解析文档:{self.filename}")
        try:
            self.doc = Document(self.file_path)
            self.dl.log_step("加载文档", f"成功加载:{self.file_path}")
            self._extract_title()
            self._extract_content()
            self._extract_images()
            self._update_metadata()
            self.pl.success(f"文档解析完成 - 标题:{self.title},段落数:{self.metadata['paragraph_count']},图片数:{self.metadata['image_count']}")
            return {
                'title': self.title,
                'content': self.content_parts,
                'images': self.images,
                'metadata': self.metadata
            }
        except Exception as e:
            self.pl.error(f"文档解析失败:{str(e)}")
            self.dl.error(f"解析异常:{str(e)}", exc_info=True)
            raise

    def _extract_title(self):
        """Set ``self.title`` from the first source that yields non-empty text.

        Order: document core property, first paragraph with a ``Heading``
        style, first paragraph that looks like a title (see
        ``_is_title_style``), and finally the file name without extension.
        Blank candidates are skipped so an empty heading cannot produce an
        empty title.
        """
        self.dl.log_step("提取标题")

        # Source 1: the document's core "title" property.
        property_title = (self.doc.core_properties.title or "").strip()
        if property_title:
            self.title = property_title
            self.dl.debug(f"从文档属性获取标题:{self.title}")
            return

        candidates = [(p, p.text.strip()) for p in self.doc.paragraphs]

        # Source 2: first non-empty paragraph using a Heading style.
        for paragraph, text in candidates:
            if text and self._style_name(paragraph).startswith('Heading'):
                self.title = text
                self.dl.debug(f"从标题样式获取标题:{self.title}")
                return

        # Source 3: first non-empty paragraph with title-like formatting.
        for paragraph, text in candidates:
            if text and self._is_title_style(paragraph):
                self.title = text
                self.dl.debug(f"从样式特征获取标题:{self.title}")
                return

        # Source 4: fall back to the file name.
        self.title = os.path.splitext(self.filename)[0]
        self.dl.warning(f"使用文件名作为标题:{self.title}")

    def _extract_content(self):
        """Convert body paragraphs into HTML fragments in ``self.content_parts``.

        Level-1 headings become ``<h2>``, level-2 headings ``<h3>``,
        consecutive list items are grouped into one ``<ul>``/``<ol>``, and
        everything else becomes ``<p>``. A pending list is closed by an empty
        paragraph, a heading, a normal paragraph, or a change of list type.
        """
        self.dl.log_step("提取内容")
        html_parts = []
        list_type = None
        list_items = []

        def flush_list():
            # _close_list joins the items immediately, so clearing is safe.
            if list_items:
                html_parts.extend(self._close_list(list_type, list_items))
                list_items.clear()

        for i, paragraph in enumerate(self.doc.paragraphs):
            text = paragraph.text.strip()
            if not text:
                flush_list()
                continue

            style_name = self._style_name(paragraph)
            heading_tag = None
            if style_name.startswith('Heading 1') or self._is_heading(paragraph, 1):
                heading_tag = 'h2'
            elif style_name.startswith('Heading 2') or self._is_heading(paragraph, 2):
                heading_tag = 'h3'
            if heading_tag:
                flush_list()
                html_parts.append(f'<{heading_tag}>{self._escape_html(text)}</{heading_tag}>')
                self.dl.debug(f"段落 {i}: {heading_tag.upper()} 标题")
                continue

            if self._is_list_item(paragraph):
                item_type = 'ol' if self._is_numbered_list(paragraph) else 'ul'
                # A bullet item following numbered items (or vice versa)
                # starts a new list instead of inheriting the wrong tag.
                if list_items and item_type != list_type:
                    flush_list()
                list_type = item_type
                clean_text = self._clean_list_marker(text)
                list_items.append(f'<li>{self._format_run_styles(clean_text, paragraph)}</li>')
                self.dl.debug(f"段落 {i}: 列表项")
                continue

            flush_list()
            formatted_text = self._format_run_styles(text, paragraph)
            html_parts.append(f'<p>{formatted_text}</p>')
            self.dl.debug(f"段落 {i}: 普通段落")

        flush_list()
        self.content_parts = html_parts

    def _extract_images(self):
        """Collect embedded images into ``self.images``.

        Pass 1 walks the runs of every paragraph so images found there carry
        their ``paragraph_index``. Pass 2 walks the document part's
        relationships and adds images not already collected, with
        ``paragraph_index`` -1. The list is rebuilt on every call and
        ``index`` values are contiguous, starting at 1.
        """
        self.dl.log_step("提取图片")
        self.images = []  # avoid duplicates when parse() runs more than once
        seen_rel_ids = set()

        for i, paragraph in enumerate(self.doc.paragraphs):
            for run in paragraph.runs:
                run_xml = run._element.xml
                if 'pic:pic' not in run_xml and 'w:binData' not in run_xml:
                    continue
                image_data = self._extract_image_from_run(run)
                if not image_data:
                    continue
                rel_id = image_data.get('rel_id')
                if rel_id is not None:
                    if rel_id in seen_rel_ids:
                        continue
                    seen_rel_ids.add(rel_id)
                entry = self._record_image(image_data, i)
                self.dl.debug(f"提取图片 {entry['index']}:{entry['filename']}")

        for rel_id, rel in self.doc.part.rels.items():
            if "image" not in rel.target_ref:
                continue
            # External relationships (e.g. a hyperlink whose URL contains
            # "image") have no target part to read.
            if rel_id in seen_rel_ids or getattr(rel, 'is_external', False):
                continue
            image_data = self._extract_image_from_rel(rel)
            if not image_data:
                continue
            entry = self._record_image(image_data, -1)
            self.dl.debug(f"提取图片 {entry['index']}(从关系):{entry['filename']}")

    def _record_image(self, image_data, paragraph_index):
        """Append one image entry to ``self.images`` and return it."""
        index = len(self.images) + 1
        entry = {
            'index': index,
            'paragraph_index': paragraph_index,
            'filename': f"image_{index}_{image_data['hash'][:8]}.{image_data['format']}",
            'data': image_data['data'],
            'format': image_data['format']
        }
        self.images.append(entry)
        return entry

    def _update_metadata(self):
        """Refresh paragraph, image and (rough, character-based) word counts."""
        paragraphs = self.doc.paragraphs
        self.metadata['paragraph_count'] = len(paragraphs)
        self.metadata['image_count'] = len(self.images)
        # Rough count: total characters across all paragraphs.
        self.metadata['word_count'] = sum(len(p.text) for p in paragraphs)

    # ========== Helpers ==========

    @staticmethod
    def _style_name(paragraph):
        """Return the paragraph's style name, or '' when it has none."""
        style = getattr(paragraph, 'style', None)
        return getattr(style, 'name', None) or ''

    def _is_title_style(self, paragraph):
        """Return True if the first run is at least 16pt or is bold."""
        if not paragraph.runs:
            return False
        font = paragraph.runs[0].font
        if font.size and font.size.pt and font.size.pt >= 16:
            return True
        return bool(font.bold)

    def _is_heading(self, paragraph, level):
        """Return True if the first run is bold and large enough for *level*.

        Levels 1 and 2 are supported (>= 18pt and >= 14pt respectively); any
        other level returns False.
        """
        min_pt = self._HEADING_MIN_PT.get(level)
        if min_pt is None or not paragraph.runs:
            return False
        font = paragraph.runs[0].font
        return bool(font.size and font.size.pt >= min_pt and font.bold)

    def _match_marker(self, text):
        """Classify a hand-typed list marker at the start of *text*.

        Returns:
            ``(list_type, length)`` where ``list_type`` is ``'ul'``, ``'ol'``
            or ``None`` and ``length`` is the number of leading characters
            that make up the marker (0 when there is none).
        """
        if text[:1] in self._BULLET_CHARS:
            return 'ul', 1
        if self._HYPHEN_MARKER_RE.match(text):
            return 'ul', 1
        for pattern in (self._DIGIT_MARKER_RE,
                        self._ALPHA_MARKER_RE,
                        self._PAREN_MARKER_RE):
            match = pattern.match(text)
            if match:
                return 'ol', match.end()
        return None, 0

    def _is_list_item(self, paragraph):
        """Return True if the paragraph has a list marker or a List style."""
        text = paragraph.text.strip()
        if not text:
            return False
        list_type, _ = self._match_marker(text)
        if list_type:
            return True
        return 'List' in self._style_name(paragraph)

    def _is_numbered_list(self, paragraph):
        """Return True if the paragraph is an ordered (numbered) list item.

        A typed marker decides first; otherwise a ``List Number`` style
        counts as numbered, because the paragraph text carries no number of
        its own in that case.
        """
        list_type, _ = self._match_marker(paragraph.text.strip())
        if list_type:
            return list_type == 'ol'
        return 'List Number' in self._style_name(paragraph)

    def _clean_list_marker(self, text):
        """Remove one leading list marker from *text* and strip whitespace.

        Text without a recognised marker (including the empty string) is
        returned stripped but otherwise unchanged.
        """
        text = text.strip()
        _, marker_length = self._match_marker(text)
        return text[marker_length:].strip()

    def _format_run_styles(self, text, paragraph):
        """Render *text* as HTML, keeping bold/italic from the paragraph runs.

        *text* is normally the paragraph text after stripping whitespace
        and/or a list marker, i.e. a suffix of the joined run text. Only
        that part of the runs is emitted, so a removed marker does not
        reappear. If *text* cannot be located in the runs, every run is
        emitted unchanged. Without usable runs, the escaped *text* is
        returned.
        """
        runs = [run for run in (paragraph.runs or []) if run.text]
        if not runs:
            return self._escape_html(text)

        raw = ''.join(run.text for run in runs)
        start, end = 0, len(raw)
        if raw.strip().endswith(text):
            end = len(raw.rstrip())
            start = end - len(text)

        pieces = []
        offset = 0
        for run in runs:
            run_start = offset
            offset += len(run.text)
            # Clip the run to the [start, end) window of the joined text.
            segment = run.text[max(start - run_start, 0):max(end - run_start, 0)]
            if not segment:
                continue
            markup = self._escape_html(segment)
            if run.font.bold:
                markup = f'<strong>{markup}</strong>'
            if run.font.italic:
                markup = f'<em>{markup}</em>'
            pieces.append(markup)
        return ''.join(pieces) if pieces else self._escape_html(text)

    def _close_list(self, list_type, items):
        """Wrap *items* in a single ``<list_type>`` element.

        Returns a one-element list, or an empty list when there are no items.
        """
        if not items:
            return []
        return [f'<{list_type}>{"".join(items)}</{list_type}>']

    def _escape_html(self, text):
        """Escape ``&``, ``<``, ``>`` and ``"`` for use in HTML."""
        return text.translate(self._HTML_ESCAPES)

    def _build_image_record(self, data, fmt):
        """Return the ``data``/``format``/``hash`` dict for one image blob."""
        return {
            'data': data,
            'format': fmt,
            # MD5 is only used to derive a short, stable file-name suffix.
            'hash': hashlib.md5(data).hexdigest()
        }

    def _extract_image_from_run(self, run):
        """Extract the image carried by *run*, if any.

        Handles base64 ``w:binData`` payloads and DrawingML pictures whose
        ``a:blip`` references an embedded part via ``r:embed``; the latter
        adds a ``rel_id`` key to the result.

        Returns:
            dict with ``data``, ``format`` and ``hash`` (plus ``rel_id`` for
            embedded pictures), or None when nothing could be extracted.
            Errors are logged, not raised.
        """
        try:
            element = run._element
            run_xml = element.xml
            if 'w:binData' in run_xml:
                import xml.etree.ElementTree as ET
                root = ET.fromstring(run_xml)
                for elem in root.iter():
                    if 'binData' in elem.tag and elem.text:
                        image_data = base64.b64decode(elem.text)
                        return self._build_image_record(
                            image_data, self._detect_image_format(image_data))
            if 'pic:pic' in run_xml:
                for rel_id in element.xpath('.//a:blip/@r:embed'):
                    image_part = self.doc.part.related_parts.get(rel_id)
                    if image_part is None:
                        continue
                    record = self._build_image_record(
                        image_part.blob,
                        self._content_type_to_format(image_part.content_type))
                    record['rel_id'] = rel_id
                    return record
        except Exception as e:
            # Best effort: one unreadable image must not abort the parse.
            self.dl.error(f"提取图片失败:{str(e)}")
        return None

    def _extract_image_from_rel(self, rel):
        """Extract the image that relationship *rel* points to.

        Returns:
            dict with ``data``, ``format`` and ``hash``, or None on failure.
            Errors are logged, not raised.
        """
        try:
            image_part = rel.target_part
            return self._build_image_record(
                image_part.blob,
                self._content_type_to_format(image_part.content_type))
        except Exception as e:
            self.dl.error(f"从关系提取图片失败:{str(e)}")
            return None

    def _detect_image_format(self, data):
        """Guess the image format from magic bytes; defaults to ``'jpg'``."""
        if data[:3] == b'\xff\xd8\xff':
            return 'jpg'
        if data[:8] == b'\x89PNG\r\n\x1a\n':
            return 'png'
        if data[:6] in (b'GIF87a', b'GIF89a'):
            return 'gif'
        if data[:2] == b'BM':
            return 'bmp'
        if data[:4] == b'RIFF' and data[8:12] == b'WEBP':
            return 'webp'
        return 'jpg'  # unknown signature: keep the historical default

    def _content_type_to_format(self, content_type):
        """Map a MIME content type to a file extension; defaults to ``'jpg'``."""
        key = (content_type or '').strip().lower()
        return self._CONTENT_TYPE_FORMATS.get(key, 'jpg')
def parse_word_file(file_path):
    """Parse a Word document in one call.

    Builds a ``WordParser`` for *file_path* and returns whatever its
    ``parse()`` method returns.

    Args:
        file_path: Path of the Word document.

    Returns:
        dict: The parse result.
    """
    return WordParser(file_path).parse()
if __name__ == '__main__':
    import sys

    # Command-line entry point: the document path is the first argument.
    if len(sys.argv) < 2:
        # Usage hint; the separator after the label is restored so the
        # message no longer runs the label and the command together.
        print("用法:python wp_parse_docx.py <word 文件路径>")
        sys.exit(1)

    result = parse_word_file(sys.argv[1])
    print(f"标题:{result['title']}")
    print(f"段落数:{result['metadata']['paragraph_count']}")
    print(f"图片数:{result['metadata']['image_count']}")
    print(f"字数:{result['metadata']['word_count']}")
    # Content is a list of HTML fragments; print them as one string.
    print(f"\nHTML 内容:\n{''.join(result['content'])}")