- 飞书消息接收与处理(文字、图片、Word 文档) - WordPress REST API 文章发布 - 图片自动上传到媒体库 - Word 文档解析与发布 - HTML 格式化与分类自动匹配 - Python CLI 工具(避免 shell 引号冲突) - Webhook 服务器(8080 端口) - 完整日志系统
292 lines
8.4 KiB
Python
292 lines
8.4 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
WordPress 发布系统 - HTML 格式化模块
|
||
将解析后的内容转换为 WordPress 可用的 HTML 格式
|
||
"""
|
||
|
||
import os
|
||
import re
|
||
from modules.wp_logger import get_publish_logger, get_debug_logger
|
||
|
||
# 基础配置
|
||
# Project root directory: two levels above this file's absolute path
# (the parent of the directory that contains this module).
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||
|
||
|
||
class HTMLFormatter:
    """Convert parsed message/document content into WordPress-ready HTML.

    The formatter works on two kinds of input:

    * ``format_content`` joins pre-rendered HTML fragments and swaps image
      placeholders for ``<img>`` tags that point at uploaded media.
    * ``format_text_content`` turns plain text (with a light Markdown
      flavour: ``#`` headings, bullets, numbered lists, ``**bold**``,
      ``*italic*``, ``[label](url)``) into headings, lists and paragraphs.
    """

    # Inline CSS applied to every generated <img> element.
    _IMG_STYLE = 'max-width: 100%; height: auto; display: block; margin: 16px auto;'

    # Exactly one leading list marker: a bullet glyph, "1." / "1、" / "1)",
    # or "a." / "a、" / "a)", plus any whitespace that follows it.
    _LIST_MARKER_RE = re.compile(r'^(?:[•\-–—▪▸▹*]|\d+[.、)]|[a-zA-Z][.、)])\s*')

    # Markdown ATX heading prefix ("# ", "## ", ...); whitespace is required
    # so that "#hashtag"-style text keeps its leading "#".
    _HEADING_PREFIX_RE = re.compile(r'^#{1,6}\s+')

    def __init__(self):
        """Fetch the publish and debug loggers used by the formatter."""
        self.pl = get_publish_logger()
        self.dl = get_debug_logger()

    def format_content(self, content_parts, uploaded_images=None):
        """Assemble content fragments into one HTML document.

        Args:
            content_parts: List of HTML strings and image placeholder dicts
                (``{'type': 'image_placeholder', 'index': <1-based int>}``).
            uploaded_images: List of uploaded-image dicts; the placeholder
                index selects an entry by position and its ``'url'`` is used.

        Returns:
            str: The joined and tidied HTML, or ``""`` for empty input.
        """
        self.dl.log_step("格式化 HTML 内容")

        if not content_parts:
            return ""

        html_parts = []
        for part in content_parts:
            if isinstance(part, dict) and part.get('type') == 'image_placeholder':
                img_tag = self._render_image(part, uploaded_images)
                if img_tag is None:
                    # A placeholder dict must never reach str.join below
                    # (that would raise TypeError), so drop it instead.
                    self.dl.debug(f"跳过无法匹配的图片占位符:{part.get('index')}")
                else:
                    html_parts.append(img_tag)
                continue
            html_parts.append(part)

        full_html = self._optimize_html('\n\n'.join(html_parts))

        self.dl.debug(f"HTML 内容长度:{len(full_html)} 字符")
        return full_html

    def _render_image(self, placeholder, uploaded_images):
        """Return the ``<img>`` tag for a placeholder, or None if unresolvable."""
        if not uploaded_images:
            return None

        img_index = placeholder.get('index', 0)
        # Indexes are 1-based. Rejecting 0/negatives prevents Python's
        # negative indexing from silently selecting the wrong image.
        if not isinstance(img_index, int) or not 1 <= img_index <= len(uploaded_images):
            return None

        img = uploaded_images[img_index - 1]
        if 'url' not in img or not img['url']:
            return None

        src = self._escape_plain(str(img['url']))
        return f'<img src="{src}" alt="图片 {img_index}" style="{self._IMG_STYLE}">'

    def format_text_content(self, text, images=None):
        """Convert plain text into HTML headings, lists and paragraphs.

        Each non-empty line is classified as a list item, a heading or a
        paragraph. List detection runs before the heading heuristic, because
        the heuristic ("short and without closing punctuation") would
        otherwise claim nearly every list item. Consecutive list items of the
        same kind are merged into a single ``<ul>``/``<ol>``.

        Args:
            text: Plain-text content.
            images: Accepted for interface compatibility; not used here.

        Returns:
            str: HTML blocks separated by blank lines, or ``""`` for empty
            input.
        """
        self.dl.log_step("格式化文本内容")

        if not text:
            return ""

        html_parts = []
        list_lines = []       # consecutive list items waiting to be emitted
        list_ordered = False  # whether the pending list is numbered

        def flush_list():
            if list_lines:
                html_parts.append(self._format_list('\n'.join(list_lines)))
                list_lines.clear()

        for para in self._split_paragraphs(text):
            para = para.strip()
            if not para:
                continue

            if self._is_list(para):
                ordered = bool(re.match(r'^\d+', para))
                # A switch between bullets and numbers starts a new list.
                if list_lines and ordered != list_ordered:
                    flush_list()
                list_ordered = ordered
                list_lines.append(para)
                continue

            flush_list()
            if self._is_title(para):
                # Drop the Markdown "# " marker so it is not shown literally.
                heading = self._HEADING_PREFIX_RE.sub('', para)
                html_parts.append(f'<h2>{self._escape_html(heading)}</h2>')
            else:
                html_parts.append(f'<p>{self._format_text_styles(para)}</p>')

        flush_list()
        return '\n\n'.join(html_parts)

    def generate_excerpt(self, content, max_length=200):
        """Build a plain-text excerpt from HTML content.

        Args:
            content: HTML content (``None`` is treated as empty).
            max_length: Maximum number of characters kept before ``...``.

        Returns:
            str: The tag-free text, truncated with ``...`` when too long.
        """
        text = re.sub(r'<[^>]+>', '', content or '').strip()

        if len(text) > max_length:
            # rstrip so the ellipsis never follows dangling whitespace.
            text = text[:max_length].rstrip() + '...'

        return text

    def extract_title_from_content(self, content):
        """Derive a title from the first non-empty line of the content.

        Args:
            content: Plain-text content (``None`` is treated as empty).

        Returns:
            str: The first non-empty line (without a leading ``# `` heading
            marker) when shorter than 50 characters, otherwise its first 30
            characters; ``"无标题文章"`` when there is no text at all.
        """
        if not content:
            return "无标题文章"

        for line in content.strip().split('\n'):
            line = self._HEADING_PREFIX_RE.sub('', line.strip())
            if line:
                # Short lines are taken whole; long ones are cut to 30 chars.
                return line if len(line) < 50 else line[:30]

        return "无标题文章"

    def _optimize_html(self, html):
        """Normalise blank lines between blocks and drop empty paragraphs."""
        # Collapse runs of three or more newlines.
        html = re.sub(r'\n{3,}', '\n\n', html)
        # Exactly one blank line between adjacent paragraphs.
        html = re.sub(r'</p>\s*<p>', '</p>\n\n<p>', html)
        # Remove paragraphs that contain only whitespace.
        html = re.sub(r'<p>\s*</p>', '', html)
        return html

    def _split_paragraphs(self, text):
        """Split text into lines, treating every newline as a paragraph break.

        Blank-line-separated chunks are split first and each chunk is then
        split on single newlines as well (Word exports use single newlines).
        """
        result = []
        for chunk in re.split(r'\n\n+', text):
            result.extend(re.split(r'\n+', chunk.strip()))
        return result

    def _is_title(self, text):
        """Heuristically decide whether a line is a heading.

        A line longer than 60 characters is never a heading. Otherwise it is
        a heading when it starts with ``#`` or does not end with sentence
        punctuation (half-width or full-width).
        """
        if len(text) > 60:
            return False

        if text.startswith('#'):
            return True

        return not text.endswith(('。', '.', '!', '！', '?', '？'))

    def _is_list(self, text):
        """Return True when the line starts with a bullet or numbering marker."""
        text = text.strip()
        if not text:
            return False

        first = text[0]
        if first == '*':
            # "*" doubles as the emphasis delimiter ("**bold**", "*note"), so
            # it only counts as a bullet when followed by whitespace.
            return text[1:2].isspace()

        if first in '•-–—▪▸▹':
            return True

        # "1." / "1、" / "1)" or "a." / "a、" / "a)".
        return bool(re.match(r'^(?:\d+|[a-zA-Z])[.、)]', text))

    def _format_list(self, text):
        """Render newline-separated list items as a ``<ul>`` or ``<ol>``.

        The list is ordered when its first line starts with a digit. Only one
        leading marker is removed from each item, so content that merely
        looks like a marker (e.g. ``- 3.5 版本``) is preserved.
        """
        lines = text.strip().split('\n')
        items = []

        for line in lines:
            line = line.strip()
            if not line:
                continue

            clean_text = self._LIST_MARKER_RE.sub('', line, count=1)
            items.append(f'<li>{self._format_text_styles(clean_text)}</li>')

        is_ordered = bool(re.match(r'^\d+', lines[0])) if lines else False
        list_tag = 'ol' if is_ordered else 'ul'

        return f'<{list_tag}>\n{"".join(items)}\n</{list_tag}>'

    def _format_text_styles(self, text):
        """Escape text and convert inline Markdown (bold, italic, links).

        Escaping happens first so that text inside emphasis and links is
        escaped too. Links are then swapped for tokens while emphasis is
        applied, which keeps ``_``/``*`` inside URLs from being rewritten.
        """
        links = []

        def stash_link(match):
            label = self._apply_emphasis(match.group(1))
            links.append(f'<a href="{match.group(2)}">{label}</a>')
            return f'\x00L{len(links) - 1}\x00'

        text = self._escape_html(text)
        text = re.sub(r'\[(.+?)\]\((.+?)\)', stash_link, text)
        text = self._apply_emphasis(text)
        return re.sub(r'\x00L(\d+)\x00', lambda m: links[int(m.group(1))], text)

    def _apply_emphasis(self, text):
        """Convert ``**bold**``/``__bold__`` and ``*italic*``/``_italic_``."""
        text = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', text)
        # Underscores glued to ASCII letters/digits (snake_case, URLs) are
        # not emphasis delimiters.
        text = re.sub(r'(?<![A-Za-z0-9])__(.+?)__(?![A-Za-z0-9])',
                      r'<strong>\1</strong>', text)
        # A "*" next to whitespace is a literal asterisk ("2 * 3 * 4").
        text = re.sub(r'\*(?!\s)(.+?)(?<!\s)\*', r'<em>\1</em>', text)
        text = re.sub(r'(?<![A-Za-z0-9])_(.+?)_(?![A-Za-z0-9])',
                      r'<em>\1</em>', text)
        return text

    @staticmethod
    def _escape_plain(text):
        """Escape ``&``, ``<``, ``>`` and ``"`` as HTML entities."""
        # "&" must be replaced first so later entities are not re-escaped.
        return (text.replace('&', '&amp;')
                    .replace('<', '&lt;')
                    .replace('>', '&gt;')
                    .replace('"', '&quot;'))

    def _escape_html(self, text):
        """Escape HTML special characters while keeping whitelisted tags.

        Complete ``strong``/``em``/``a``/``li``/``ol``/``ul`` elements (and
        self-closing ``strong``/``em``/``a``) already present in the text are
        left untouched; everything else is entity-escaped.
        """
        # NUL delimits the internal tokens below; it is not valid in HTML,
        # so removing it from the input rules out token collisions.
        text = text.replace('\x00', '')
        protected = []

        def protect(match):
            protected.append(match.group(0))
            return f'\x00P{len(protected) - 1}\x00'

        text = re.sub(r'<(strong|em|a|li|ol|ul)\b[^>]*>.*?</\1>', protect, text)
        text = re.sub(r'<(strong|em|a)\b[^>]*/>', protect, text)

        text = self._escape_plain(text)

        # Restore newest-first so a token captured inside a later match is
        # itself restored afterwards.
        for i in reversed(range(len(protected))):
            text = text.replace(f'\x00P{i}\x00', protected[i])

        return text
|
||
|
||
|
||
def create_formatter():
    """Build and return a new ``HTMLFormatter`` instance."""
    formatter = HTMLFormatter()
    return formatter
|
||
|
||
|
||
if __name__ == '__main__':
    # Manual smoke test: render a small sample covering a heading,
    # paragraphs, a bullet list, inline emphasis and a numbered list.
    sample_text = """# 这是一个标题

这是第一段内容。

- 列表项 1
- 列表项 2
- 列表项 3

这是第二段内容,包含 **加粗** 和 *斜体*。

1. 编号列表 1
2. 编号列表 2
"""

    demo_formatter = create_formatter()
    rendered = demo_formatter.format_text_content(sample_text)

    print("格式化后的 HTML:")
    print(rendered)
|