# feishu_fabu/modules/wp_formatter.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
WordPress 发布系统 - HTML 格式化模块
将解析后的内容转换为 WordPress 可用的 HTML 格式
"""

import os
import re
from modules.wp_logger import get_publish_logger, get_debug_logger

# Base configuration: the parent of the directory that contains this file.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))


class HTMLFormatter:
    """Turn parsed document content into HTML for a WordPress post.

    Two entry points are offered: ``format_content`` assembles already
    rendered HTML fragments and image placeholders, while
    ``format_text_content`` converts plain text carrying light
    Markdown-style markup (headings, lists, bold/italic, links) into HTML.
    """

    # Line-start list markers. ``*`` only counts as a bullet when followed by
    # whitespace, because this formatter also uses ``*`` / ``**`` for emphasis.
    _BULLET_RE = re.compile(r'^(?:[•\-–—▪▸▹]|\*(?=\s))\s*')
    _NUMBER_RE = re.compile(r'^\d+[.、)]\s*')
    _LETTER_RE = re.compile(r'^[a-zA-Z][.、)]\s*')

    # Markdown ATX heading marker ("# Title"); whitespace after it is required
    # so that "#1" or "#hashtag" keep their leading "#".
    _HEADING_RE = re.compile(r'^#{1,6}\s+')

    # Tags that are passed through verbatim instead of being escaped.
    _PAIRED_TAG_RE = re.compile(r'<(strong|em|a|li|ol|ul)[^>]*>.*?</\1>')
    _VOID_TAG_RE = re.compile(r'<(strong|em|a)[^>]*/>')

    _LINK_RE = re.compile(r'\[(.+?)\]\((.+?)\)')

    # Delimiters of the internal placeholder tokens. They are asymmetric so a
    # token can never be matched across two neighbouring tokens, and both
    # characters are stripped from input text so user text cannot forge one.
    _TOKEN_OPEN = '\x00'
    _TOKEN_CLOSE = '\x01'

    def __init__(self):
        self.pl = get_publish_logger()
        self.dl = get_debug_logger()

    def format_content(self, content_parts, uploaded_images=None):
        """Join content fragments into one HTML document.

        Args:
            content_parts: Sequence of HTML strings and image placeholder
                dicts (``{'type': 'image_placeholder', 'index': N}``, where
                ``N`` is 1-based into ``uploaded_images``).
            uploaded_images: Sequence of dicts describing uploaded images;
                the ``'url'`` key is used for the ``<img>`` source.

        Returns:
            str: The combined HTML, or ``""`` when there are no parts.
            Placeholders that cannot be resolved and parts that are not
            strings are skipped (and reported to the debug logger) instead
            of breaking the final join.
        """
        self.dl.log_step("格式化 HTML 内容")

        if not content_parts:
            return ""

        html_parts = []
        for part in content_parts:
            if isinstance(part, dict) and part.get('type') == 'image_placeholder':
                img_html = self._render_image(part, uploaded_images)
                if img_html is None:
                    self.dl.debug(f"跳过无法解析的图片占位符：{part!r}")
                else:
                    html_parts.append(img_html)
            elif isinstance(part, str):
                html_parts.append(part)
            else:
                # Only strings can be joined; anything else would raise.
                self.dl.debug(f"跳过非字符串内容片段：{part!r}")

        full_html = self._optimize_html('\n\n'.join(html_parts))

        self.dl.debug(f"HTML 内容长度：{len(full_html)} 字符")
        return full_html

    def _render_image(self, placeholder, uploaded_images):
        """Return the ``<img>`` tag for a placeholder, or None if unresolvable."""
        if not uploaded_images:
            return None

        img_index = placeholder.get('index', 0)
        # The index is 1-based: 0 or a negative value must not wrap around to
        # the end of the list through negative indexing.
        if not isinstance(img_index, int) or not 1 <= img_index <= len(uploaded_images):
            return None

        img = uploaded_images[img_index - 1]
        if not isinstance(img, dict) or 'url' not in img:
            return None

        return f'<img src="{img["url"]}" alt="图片 {img_index}" style="max-width: 100%; height: auto; display: block; margin: 16px auto;">'

    def format_text_content(self, text, images=None):
        """Convert plain text into HTML headings, lists and paragraphs.

        Every non-blank line is classified on its own. Consecutive list lines
        of the same marker kind are merged into a single ``<ul>``/``<ol>``.
        A lone numbered or lettered line that also looks like a title (for
        example ``1. Overview``) is kept as a heading rather than turned into
        a one-item list.

        Args:
            text: The text to convert.
            images: Accepted for interface compatibility; not used here.

        Returns:
            str: HTML blocks separated by blank lines, ``""`` for empty input.
        """
        self.dl.log_step("格式化文本内容")

        if not text:
            return ""

        lines = [line.strip() for line in self._split_paragraphs(text)]
        lines = [line for line in lines if line]

        html_parts = []
        pos = 0
        while pos < len(lines):
            line = lines[pos]
            kind = self._list_kind(line)

            if kind is None:
                if self._is_title(line):
                    html_parts.append(self._format_heading(line))
                else:
                    html_parts.append(f'<p>{self._format_text_styles(line)}</p>')
                pos += 1
                continue

            # Collect the whole run of list lines sharing this marker kind.
            end = pos + 1
            while end < len(lines) and self._list_kind(lines[end]) == kind:
                end += 1
            run = lines[pos:end]
            pos = end

            if len(run) == 1 and kind != 'bullet' and self._is_title(line):
                html_parts.append(self._format_heading(line))
            else:
                html_parts.append(self._format_list('\n'.join(run)))

        return '\n\n'.join(html_parts)

    def generate_excerpt(self, content, max_length=200):
        """Build a plain-text excerpt from HTML content.

        Args:
            content: HTML content.
            max_length: Maximum number of characters kept before ``...``.

        Returns:
            str: The text with tags removed and surrounding whitespace
            stripped, truncated to ``max_length`` characters plus ``...``
            when longer. ``""`` for empty input.
        """
        if not content:
            return ""

        # Strip before measuring so surrounding whitespace neither counts
        # toward the limit nor triggers a needless truncation.
        text = re.sub(r'<[^>]+>', '', content).strip()

        if len(text) > max_length:
            text = text[:max_length] + '...'

        return text.strip()

    def extract_title_from_content(self, content):
        """Derive a title from the first non-blank line of the content.

        Args:
            content: Text content.

        Returns:
            str: The first non-blank line with any leading Markdown heading
            marker removed; the whole line if it is shorter than 50
            characters, otherwise its first 30 characters. ``"无标题文章"``
            when there is no non-blank line.
        """
        if not content:
            return "无标题文章"

        for line in content.strip().split('\n'):
            line = self._HEADING_RE.sub('', line.strip())
            if line:
                if len(line) < 50:
                    return line
                return line[:30]

        return "无标题文章"

    def _optimize_html(self, html):
        """Normalise blank lines and drop empty paragraphs."""
        # Remove empty paragraphs first so the gaps they leave behind are
        # cleaned up by the spacing rules below.
        html = re.sub(r'<p>\s*</p>', '', html)

        # Exactly one blank line between adjacent paragraphs.
        html = re.sub(r'</p>\s*<p>', '</p>\n\n<p>', html)

        # Collapse runs of three or more newlines.
        html = re.sub(r'\n{3,}', '\n\n', html)

        return html

    def _split_paragraphs(self, text):
        """Split text into its non-blank lines.

        Both blank-line and single-newline separators end a paragraph, and
        ``\\r\\n`` / ``\\r`` line endings are treated like ``\\n``.
        """
        if not text:
            return []

        normalized = text.replace('\r\n', '\n').replace('\r', '\n')
        return [line for line in normalized.split('\n') if line.strip()]

    def _is_title(self, text):
        """Return True if the text looks like a heading.

        A heading is at most 60 characters long and either starts with ``#``
        or does not end with sentence-final punctuation.
        """
        if not text or len(text) > 60:
            return False

        if text.startswith('#'):
            return True

        return not text.endswith(('。', '.', '！', '!', '？', '?'))

    def _list_kind(self, text):
        """Classify a line as 'bullet', 'number' or 'letter', else None."""
        text = text.strip() if text else ''
        if not text:
            return None
        if self._BULLET_RE.match(text):
            return 'bullet'
        if self._NUMBER_RE.match(text):
            return 'number'
        if self._LETTER_RE.match(text):
            return 'letter'
        return None

    def _is_list(self, text):
        """Return True if the text starts with a list marker."""
        return self._list_kind(text) is not None

    def _strip_list_marker(self, line):
        """Remove a single leading list marker from the line, if present."""
        # Only the first matching marker is removed, so item text that itself
        # starts like a marker ("A. Smith said ...") is left intact.
        for pattern in (self._BULLET_RE, self._NUMBER_RE, self._LETTER_RE):
            match = pattern.match(line)
            if match:
                return line[match.end():]
        return line

    def _format_list(self, text):
        """Render newline-separated list lines as ``<ul>`` or ``<ol>``.

        The list is ordered when its first line carries a numeric marker.
        Returns ``""`` when there are no non-blank lines.
        """
        lines = [line.strip() for line in (text or '').split('\n')]
        lines = [line for line in lines if line]
        if not lines:
            return ''

        items = [
            f'<li>{self._format_text_styles(self._strip_list_marker(line))}</li>'
            for line in lines
        ]
        list_tag = 'ol' if self._NUMBER_RE.match(lines[0]) else 'ul'

        return f'<{list_tag}>\n{"".join(items)}\n</{list_tag}>'

    def _format_heading(self, line):
        """Render a line as ``<h2>``, dropping a leading Markdown ``#`` marker."""
        return f'<h2>{self._escape_html(self._HEADING_RE.sub("", line))}</h2>'

    def _format_text_styles(self, text):
        """Escape text and convert inline Markdown (bold, italic, links).

        The text is escaped before any markup is generated, so characters
        inside emphasised spans, link labels and link URLs are escaped too.
        Whitelisted tags already present in the text are kept verbatim.
        """
        if not text:
            return ''

        held = []
        text = self._stash_allowed_tags(text, held)
        text = self._escape_plain(text)

        # Links are converted first and parked behind tokens so underscores
        # or asterisks inside a URL are never taken for emphasis markers.
        def link(match):
            label = self._apply_emphasis(match.group(1))
            return self._hold(held, f'<a href="{match.group(2)}">{label}</a>')

        text = self._LINK_RE.sub(link, text)
        text = self._apply_emphasis(text)

        return self._restore_held(text, held)

    def _apply_emphasis(self, text):
        """Convert ``**x**`` / ``__x__`` to bold and ``*x*`` / ``_x_`` to italic."""
        # Underscore markers must not touch an ASCII letter, digit or
        # underscore on the outside, so snake_case identifiers stay intact.
        text = re.sub(r'\*\*(.+?)\*\*', r'<strong>\1</strong>', text)
        text = re.sub(r'(?<![A-Za-z0-9_])__(.+?)__(?![A-Za-z0-9_])',
                      r'<strong>\1</strong>', text)
        text = re.sub(r'\*(.+?)\*', r'<em>\1</em>', text)
        text = re.sub(r'(?<![A-Za-z0-9_])_(.+?)_(?![A-Za-z0-9_])',
                      r'<em>\1</em>', text)
        return text

    def _escape_html(self, text):
        """Escape HTML special characters, keeping whitelisted tags verbatim.

        ``strong``, ``em``, ``a``, ``li``, ``ol`` and ``ul`` elements already
        present in the text are preserved; ``&``, ``<``, ``>`` and ``"``
        elsewhere are replaced by entities.
        """
        if not text:
            return ''

        held = []
        text = self._stash_allowed_tags(text, held)
        text = self._escape_plain(text)
        return self._restore_held(text, held)

    def _escape_plain(self, text):
        """Replace ``&``, ``<``, ``>`` and ``"`` with HTML entities."""
        # "&" must go first so the entities produced below are not re-escaped.
        text = text.replace('&', '&amp;')
        text = text.replace('<', '&lt;')
        text = text.replace('>', '&gt;')
        return text.replace('"', '&quot;')

    def _hold(self, held, fragment):
        """Park a finished HTML fragment and return the token standing in for it."""
        held.append(fragment)
        return f'{self._TOKEN_OPEN}{len(held) - 1}{self._TOKEN_CLOSE}'

    def _stash_allowed_tags(self, text, held):
        """Replace whitelisted tags in the text with tokens stored in ``held``."""
        # Drop the token delimiters from the input so it cannot forge a token.
        text = text.replace(self._TOKEN_OPEN, '').replace(self._TOKEN_CLOSE, '')
        text = self._PAIRED_TAG_RE.sub(lambda m: self._hold(held, m.group(0)), text)
        return self._VOID_TAG_RE.sub(lambda m: self._hold(held, m.group(0)), text)

    def _restore_held(self, text, held):
        """Substitute every token with the fragment it stands for."""
        # Later fragments may contain earlier tokens (a link label wrapping a
        # kept tag), so restore from the newest fragment backwards.
        for index in range(len(held) - 1, -1, -1):
            token = f'{self._TOKEN_OPEN}{index}{self._TOKEN_CLOSE}'
            text = text.replace(token, held[index])
        return text


def create_formatter():
    """Build and return a new ``HTMLFormatter`` instance."""
    formatter = HTMLFormatter()
    return formatter


if __name__ == '__main__':
    # Manual smoke test: format a small Markdown-like sample and print the HTML.
    sample_text = """# 这是一个标题

这是第一段内容。

- 列表项 1
- 列表项 2
- 列表项 3

这是第二段内容，包含 **加粗** 和 *斜体*。

1. 编号列表 1
2. 编号列表 2
"""

    demo_formatter = create_formatter()
    rendered = demo_formatter.format_text_content(sample_text)

    print("格式化后的 HTML:")
    print(rendered)