- 飞书消息接收与处理(文字、图片、Word 文档) - WordPress REST API 文章发布 - 图片自动上传到媒体库 - Word 文档解析与发布 - HTML 格式化与分类自动匹配 - Python CLI 工具(避免 shell 引号冲突) - Webhook 服务器(8080 端口) - 完整日志系统
446 lines
15 KiB
Python
446 lines
15 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
WordPress 发布系统 - Word 文档解析模块
|
||
解析 .docx 文件,提取标题、正文、图片等元素
|
||
"""
|
||
|
||
import base64
import hashlib
import html
import os
import re

from docx import Document
from docx.shared import Pt, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH

from modules.wp_logger import get_publish_logger, get_debug_logger
|
||
|
||
# Base directories: BASE_DIR is the parent of the directory holding this
# module; TEMP_DIR is a "temp" folder beneath it, created at import time if
# it does not exist yet.
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
TEMP_DIR = os.path.join(BASE_DIR, 'temp')
os.makedirs(TEMP_DIR, exist_ok=True)
|
||
|
||
|
||
class WordParser:
    """Parse a ``.docx`` file into a title, HTML fragments and image blobs.

    Typical use is ``WordParser(path).parse()``, which loads the document
    and fills ``title``, ``content_parts``, ``images`` and ``metadata``.
    """

    # Characters treated as a hand-typed bullet when they start a paragraph.
    _BULLET_CHARS = ('•', '-', '–', '—', '▪', '▸', '▹')

    # Hand-typed numbering at the start of a paragraph:
    # "1." / "1、" / "1)", "a." / "a、" / "a)" or "(1)".
    _ORDERED_MARKER_RE = re.compile(r'^(?:\d+[.、)]|[a-zA-Z][.、)]|\(\d+\))')

    # Minimum first-run font size (pt) for a bold paragraph to count as a
    # heading of the given level when no Heading style is applied.
    _HEADING_MIN_PT = {1: 18, 2: 14}

    def __init__(self, file_path):
        """Store the path and initialise empty parse results.

        Args:
            file_path: Path of the Word document to parse.
        """
        self.file_path = file_path
        self.filename = os.path.basename(file_path)
        self.doc = None
        self.title = ""
        self.content_parts = []
        self.images = []
        self.metadata = {
            'paragraph_count': 0,
            'image_count': 0,
            'word_count': 0,
        }

        self.pl = get_publish_logger()
        self.dl = get_debug_logger()

    def parse(self):
        """Load the document and extract title, content, images and stats.

        Returns:
            dict: ``title`` (str), ``content`` (list of HTML fragments),
            ``images`` (list of image dicts) and ``metadata`` (dict).

        Raises:
            Exception: Any error raised while loading or extracting is
                logged to both loggers and then re-raised unchanged.
        """
        self.pl.info(f"📖 开始解析文档:{self.filename}")

        try:
            self.doc = Document(self.file_path)
            self.dl.log_step("加载文档", f"成功加载:{self.file_path}")

            self._extract_title()
            self._extract_content()
            self._extract_images()
            self._update_metadata()

            self.pl.success(f"文档解析完成 - 标题:{self.title},段落数:{self.metadata['paragraph_count']},图片数:{self.metadata['image_count']}")

            return {
                'title': self.title,
                'content': self.content_parts,
                'images': self.images,
                'metadata': self.metadata,
            }

        except Exception as e:
            self.pl.error(f"文档解析失败:{str(e)}")
            self.dl.error(f"解析异常:{str(e)}", exc_info=True)
            raise

    def _extract_title(self):
        """Set ``self.title`` from the first source that yields non-blank text.

        Sources are tried in order: the core-properties title, the first
        paragraph with a "Heading" style, the first paragraph that looks
        like a title (see ``_is_title_style``), and finally the file name
        without its extension.
        """
        self.dl.log_step("提取标题")

        # Source 1: document properties. A whitespace-only value is ignored
        # so it cannot produce an empty title.
        property_title = (self.doc.core_properties.title or '').strip()
        if property_title:
            self.title = property_title
            self.dl.debug(f"从文档属性获取标题:{self.title}")
            return

        # Source 2: first non-empty paragraph using a Heading style.
        for paragraph in self.doc.paragraphs:
            heading_text = paragraph.text.strip()
            if heading_text and self._style_name(paragraph).startswith('Heading'):
                self.title = heading_text
                self.dl.debug(f"从标题样式获取标题:{self.title}")
                return

        # Source 3: first non-empty paragraph with large or bold leading run.
        for paragraph in self.doc.paragraphs:
            candidate = paragraph.text.strip()
            if candidate and self._is_title_style(paragraph):
                self.title = candidate
                self.dl.debug(f"从样式特征获取标题:{self.title}")
                return

        # Source 4: fall back to the file name.
        self.title = os.path.splitext(self.filename)[0]
        self.dl.warning(f"使用文件名作为标题:{self.title}")

    def _extract_content(self):
        """Convert the paragraphs into HTML fragments in ``content_parts``.

        Level-1 headings become ``<h2>``, level-2 headings ``<h3>``,
        consecutive list items are grouped into one ``<ul>``/``<ol>`` (the
        type is decided by the first item of the group), and everything
        else becomes ``<p>``. Empty paragraphs are skipped and end any open
        list.
        """
        self.dl.log_step("提取内容")

        content_html = []
        list_type = None
        list_items = []

        def flush_list():
            # Emit the pending list, if any, and start collecting afresh.
            if list_items:
                content_html.extend(self._close_list(list_type, list_items))
                list_items.clear()

        for i, paragraph in enumerate(self.doc.paragraphs):
            text = paragraph.text.strip()

            if not text:
                flush_list()
                continue

            style_name = self._style_name(paragraph)
            heading_tag = None
            if style_name.startswith('Heading 1') or self._is_heading(paragraph, 1):
                heading_tag = 'h2'
            elif style_name.startswith('Heading 2') or self._is_heading(paragraph, 2):
                heading_tag = 'h3'

            if heading_tag:
                flush_list()
                content_html.append(
                    f'<{heading_tag}>{self._escape_html(text)}</{heading_tag}>'
                )
                self.dl.debug(f"段落 {i}: {heading_tag.upper()} 标题")
                continue

            if self._is_list_item(paragraph):
                if not list_items:
                    list_type = 'ol' if self._is_numbered_list(paragraph) else 'ul'

                clean_text = self._clean_list_marker(text)
                list_items.append(
                    f'<li>{self._format_run_styles(clean_text, paragraph)}</li>'
                )
                self.dl.debug(f"段落 {i}: 列表项")
                continue

            flush_list()
            content_html.append(f'<p>{self._format_run_styles(text, paragraph)}</p>')
            self.dl.debug(f"段落 {i}: 普通段落")

        # A list that runs to the end of the document still has to be emitted.
        flush_list()

        self.content_parts = content_html

    def _extract_images(self):
        """Collect embedded images into ``self.images``.

        Runs are scanned first (inline ``w:binData`` payloads), then the
        document part's relationships whose target reference contains
        "image". Each entry holds ``index`` (1-based, consecutive),
        ``paragraph_index`` (-1 for relationship images), ``filename``,
        ``data`` and ``format``. Results are rebuilt from scratch on every
        call and images with identical bytes are reported once.
        """
        self.dl.log_step("提取图片")

        self.images = []
        seen_hashes = set()

        def register(image_data, paragraph_index, note=''):
            # Only successful, not-yet-seen extractions consume an index, so
            # the numbering never has gaps.
            if not image_data or image_data['hash'] in seen_hashes:
                return
            seen_hashes.add(image_data['hash'])
            index = len(self.images) + 1
            entry = {
                'index': index,
                'paragraph_index': paragraph_index,
                'filename': f"image_{index}_{image_data['hash'][:8]}.{image_data['format']}",
                'data': image_data['data'],
                'format': image_data['format'],
            }
            self.images.append(entry)
            self.dl.debug(f"提取图片 {index}{note}:{entry['filename']}")

        for i, paragraph in enumerate(self.doc.paragraphs):
            for run in paragraph.runs:
                run_xml = run._element.xml  # serialised once per run
                if 'pic:pic' in run_xml or 'w:binData' in run_xml:
                    register(self._extract_image_from_run(run), i)

        for rel in self.doc.part.rels.values():
            # Externally linked targets have no part to read bytes from.
            if getattr(rel, 'is_external', False):
                continue
            if "image" in rel.target_ref:
                register(self._extract_image_from_rel(rel), -1, '(从关系)')

    def _update_metadata(self):
        """Refresh paragraph, image and character counts in ``metadata``."""
        paragraphs = self.doc.paragraphs
        self.metadata['paragraph_count'] = len(paragraphs)
        self.metadata['image_count'] = len(self.images)
        # Rough size measure: total number of characters, not real words.
        self.metadata['word_count'] = sum(len(p.text) for p in paragraphs)

    # ========== Helpers ==========

    @staticmethod
    def _style_name(paragraph):
        """Return the paragraph's style name, or '' when it has none."""
        style = getattr(paragraph, 'style', None)
        return getattr(style, 'name', None) or ''

    def _is_title_style(self, paragraph):
        """Return True if the first run is at least 16 pt or is bold."""
        if not paragraph.runs:
            return False

        font = paragraph.runs[0].font
        if font.size and font.size.pt and font.size.pt >= 16:
            return True
        return bool(font.bold)

    def _is_heading(self, paragraph, level):
        """Return True if the first run is bold and large enough for *level*.

        Level 1 needs at least 18 pt and level 2 at least 14 pt; any other
        level is never a heading.
        """
        min_pt = self._HEADING_MIN_PT.get(level)
        if min_pt is None or not paragraph.runs:
            return False

        font = paragraph.runs[0].font
        return bool(font.size and font.size.pt >= min_pt and font.bold)

    def _is_list_item(self, paragraph):
        """Return True if the paragraph looks like a list item.

        A paragraph qualifies when its text starts with a bullet character
        or a typed number/letter marker, or when its style name contains
        "List".
        """
        text = paragraph.text.strip()
        if not text:
            return False

        if text[0] in self._BULLET_CHARS:
            return True
        if self._ORDERED_MARKER_RE.match(text):
            return True
        return 'List' in self._style_name(paragraph)

    def _is_numbered_list(self, paragraph):
        """Return True if the list item belongs in an ordered list.

        Recognises every typed marker that ``_is_list_item`` accepts as a
        number (including "(1)") and paragraphs whose style name contains
        "List Number".
        """
        text = paragraph.text.strip()
        if text and text[0] in self._BULLET_CHARS:
            return False
        if self._ORDERED_MARKER_RE.match(text):
            return True
        return 'List Number' in self._style_name(paragraph)

    def _clean_list_marker(self, text):
        """Strip one leading bullet or number marker from *text*.

        Only the first marker is removed, so an item such as
        "1. J. Smith" keeps the initial "J.". Empty input yields ''.
        """
        text = text.strip()
        if not text:
            return ''

        if text[0] in self._BULLET_CHARS:
            return text[1:].strip()
        return self._ORDERED_MARKER_RE.sub('', text, count=1).strip()

    def _format_run_styles(self, text, paragraph):
        """Render *text* as HTML, keeping the bold/italic styling of the runs.

        *text* is expected to be the tail of the paragraph's run text
        (callers pass the stripped text, optionally with the list marker
        removed). Only that part of the runs is emitted, so a removed
        marker does not reappear. If *text* cannot be located at the end of
        the run text, all run text is emitted instead.

        Returns:
            str: Escaped HTML with ``<strong>``/``<em>`` wrappers.
        """
        runs = [run for run in paragraph.runs if run.text]
        if not runs:
            return self._escape_html(text)

        combined = ''.join(run.text for run in runs)
        visible = combined.rstrip()
        if visible.endswith(text):
            end = len(visible)
            start = end - len(text)
        else:
            start, end = 0, len(combined)

        pieces = []
        offset = 0
        for run in runs:
            run_start = offset
            offset += len(run.text)
            # Part of this run that falls inside the [start, end) window.
            fragment = run.text[max(start - run_start, 0):max(end - run_start, 0)]
            if not fragment:
                continue

            markup = self._escape_html(fragment)
            if run.font.bold:
                markup = f'<strong>{markup}</strong>'
            if run.font.italic:
                markup = f'<em>{markup}</em>'
            pieces.append(markup)

        return ''.join(pieces) if pieces else self._escape_html(text)

    def _close_list(self, list_type, items):
        """Return a one-element list holding the wrapped items, or []."""
        if not items:
            return []
        return [f'<{list_type}>{"".join(items)}</{list_type}>']

    def _escape_html(self, text):
        """Escape HTML special characters (ampersand, angle brackets, quotes)."""
        return html.escape(text, quote=True)

    def _extract_image_from_run(self, run):
        """Decode an inline ``w:binData`` image carried by *run*.

        Returns:
            dict | None: ``data`` (bytes), ``format`` and ``hash`` (MD5 hex
            digest, used only to build file names), or None when the run
            has no such payload or decoding fails (the error is logged).
        """
        try:
            run_xml = run._element.xml
            if 'w:binData' not in run_xml:
                return None

            import xml.etree.ElementTree as ET

            for elem in ET.fromstring(run_xml).iter():
                if 'binData' in elem.tag and elem.text:
                    image_data = base64.b64decode(elem.text)
                    return {
                        'data': image_data,
                        'format': self._detect_image_format(image_data),
                        'hash': hashlib.md5(image_data).hexdigest(),
                    }
        except Exception as e:
            self.dl.error(f"提取图片失败:{str(e)}")

        return None

    def _extract_image_from_rel(self, rel):
        """Read the image bytes behind the relationship *rel*.

        Returns:
            dict | None: ``data`` (bytes), ``format`` and ``hash`` (MD5 hex
            digest), or None when reading fails (the error is logged).
        """
        try:
            image_part = rel.target_part
            image_data = image_part.blob
            return {
                'data': image_data,
                'format': self._content_type_to_format(image_part.content_type),
                'hash': hashlib.md5(image_data).hexdigest(),
            }
        except Exception as e:
            self.dl.error(f"从关系提取图片失败:{str(e)}")

        return None

    def _detect_image_format(self, data):
        """Guess the file extension from the leading bytes of *data*.

        Recognises JPEG, PNG, GIF, BMP and WebP; anything else is reported
        as 'jpg'.
        """
        if data[:3] == b'\xff\xd8\xff':
            return 'jpg'
        if data[:8] == b'\x89PNG\r\n\x1a\n':
            return 'png'
        if data[:6] in (b'GIF87a', b'GIF89a'):
            return 'gif'
        if data[:2] == b'BM':
            return 'bmp'
        if data[:4] == b'RIFF' and data[8:12] == b'WEBP':
            return 'webp'
        return 'jpg'  # default when the signature is unknown

    def _content_type_to_format(self, content_type):
        """Map an image MIME type to a file extension ('jpg' if unknown)."""
        format_map = {
            'image/jpeg': 'jpg',
            'image/png': 'png',
            'image/gif': 'gif',
            'image/bmp': 'bmp',
            'image/webp': 'webp',
        }
        return format_map.get(content_type, 'jpg')
|
||
|
||
|
||
def parse_word_file(file_path):
    """Parse a Word document in a single call.

    Builds a ``WordParser`` for *file_path* and runs it.

    Args:
        file_path: Path of the Word document.

    Returns:
        dict: Whatever ``WordParser.parse`` returns for that document.
    """
    return WordParser(file_path).parse()
|
||
|
||
|
||
if __name__ == '__main__':
    # Command-line entry point: parse the document named by the first
    # argument and print a short summary followed by the generated HTML.
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        print("用法:python wp_parse_docx.py <word 文件路径>")
        sys.exit(1)

    parsed = parse_word_file(cli_args[0])
    stats = parsed['metadata']
    body_html = ''.join(parsed['content'])

    print(f"标题:{parsed['title']}")
    print(f"段落数:{stats['paragraph_count']}")
    print(f"图片数:{stats['image_count']}")
    print(f"字数:{stats['word_count']}")
    print(f"\nHTML 内容:\n{body_html}")
|