Node ? trim($titleNode->textContent) : '未命名'; // 提取发表时间 $dateNode = $xpath->query("//time")->item(0); $date = $dateNode ? trim($dateNode->textContent) : '未知日期'; // 提取分类 $categoryNode = $xpath->query("//div[@class='category']")->item(0); $category = $categoryNode ? trim($categoryNode->textContent) : '未分类'; // 提取正文内容 $bodyNodes = $xpath->query("//div[@class='content']//p"); $body = ''; foreach ($bodyNodes as $node) { $body .= trim($node->textContent) . "\n"; } $body = trim($body) ?: '无内容'; // 拼接保存内容 $content = $title . "\n" . $date . "\n" . $category . "\n\n" . $body; // 生成文件名，去除不合法字符 $filename = preg_replace('/[\/:*?"<>|]/', '', $title) . '.txt'; // 如果生成的文件名为空或与已存在的文件名冲突，使用原文件名作为基础 if (empty($filename) || file_exists($filename)) { $filename = pathinfo($htmlFile, PATHINFO_FILENAME) . '.txt'; } // 保存到文件 file_put_contents($filename, $content); echo "保存成功：$filename\n"; ;} ?>


**答：** 根据html代码给出新的答案

```php
 <?php
// Batch-convert every .html file in the current directory into a .txt file
// holding the article title, publish date, category and body text.
//
// Output layout per file: title, date, category, blank line, body.
// A status line is echoed for every input file.

// glob() returns false on error; treat that the same as "no files found".
$htmlFiles = glob("*.html") ?: [];

// Collect HTML parse errors internally instead of emitting warnings.
libxml_use_internal_errors(true);

foreach ($htmlFiles as $htmlFile) {
    // Read the page; skip unreadable or empty files because
    // DOMDocument::loadHTML() rejects an empty string.
    $html = file_get_contents($htmlFile);
    if ($html === false || $html === '') {
        echo "读取失败：$htmlFile\n";
        continue;
    }

    // NOTE(review): loadHTML() falls back to ISO-8859-1 when the document
    // declares no charset — confirm the input pages carry a <meta charset>,
    // otherwise non-ASCII text may come out garbled.
    $dom = new DOMDocument;
    $dom->loadHTML($html);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);

    // Title: text of the first <title> element.
    $titleNode = $xpath->query("//title")->item(0);
    $title = $titleNode ? trim($titleNode->textContent) : '未命名';

    // Publish date: first <span> inside div.person-info.
    $dateNode = $xpath->query("//div[@class='person-info']//span")->item(0);
    $date = $dateNode ? trim($dateNode->textContent) : '未知日期';

    // Category: the text between the first "[" and "]" of the post-title link.
    // The node may be missing, so guard before touching textContent.
    $categoryNode = $xpath->query("//div[@class='post-header-right']//h4[@class='post-title']/a")->item(0);
    $category = '未分类';
    if ($categoryNode !== null
        && preg_match('/\[(.*?)\]/', $categoryNode->textContent, $matches) === 1
    ) {
        $category = trim($matches[1]);
    }

    // Body: text of the first div.post-contents.
    $bodyNode = $xpath->query("//div[@class='post-contents']")->item(0);
    $body = $bodyNode ? trim($bodyNode->textContent) : '无内容';

    // Assemble the text that will be written out.
    $content = $title . "\n" . $date . "\n" . $category . "\n\n" . $body;

    // Build the file name from the title with characters that are illegal in
    // file names removed (backslash included).
    $safeTitle = preg_replace('/[\\\\\/:*?"<>|]/', '', $title);
    $filename = $safeTitle . '.txt';

    // Fall back to the source file's base name when the sanitized title is
    // empty or the target already exists. The emptiness test must look at the
    // title part: with ".txt" appended the full name is never empty.
    if ($safeTitle === '' || file_exists($filename)) {
        $filename = pathinfo($htmlFile, PATHINFO_FILENAME) . '.txt';
    }

    // Write the result and report success only when the write succeeded.
    if (file_put_contents($filename, $content) === false) {
        echo "保存失败：$filename\n";
        continue;
    }
    echo "保存成功：$filename\n";
}
?>

```

验证结果：依然无法正确提取文章正文与分类信息。

2. 偷懒失败，尝试手工分析 HTML 结构，将对应的结构代码交给 GPT（成功识别出分类与正文），提取 OK

原文链接：示例：问：手工提取结构代码并附上之前报错代码，给出要求

          1、）<title>根据句柄取进程路径-紫灵幽梦</title>这是标题，修改为只提取-前面字符；
          2、）<div class="person-info"><span style="font-size: 75%;color: #393939;">
              2016-11-8  <br />18:29</span> </div> 这是发布日期；
          3、）<div class="post-header-right"><h4 class="post-title"><a>[程序代码]根据句
              柄取进程路径 </a> </h4></div>这是分类，只提取"[]"内部，只提取第一个class="post-title"内容；
          4、）<div class="post-contents">~</div>这是正文部分，只提取第一个class="post-contents"内容；

要求：根据以上内容修改下面代码：

<?php
// Batch-convert every .html file in the current directory into a .txt file
// holding the article title, publish date, category and body text.
//
// Output layout per file: title, date, category, blank line, body.
// A status line is echoed for every input file.

// glob() returns false on error; treat that the same as "no files found".
$htmlFiles = glob("*.html") ?: [];

// Collect HTML parse errors internally instead of emitting warnings.
libxml_use_internal_errors(true);

foreach ($htmlFiles as $htmlFile) {
    // Read the page; skip unreadable or empty files because
    // DOMDocument::loadHTML() rejects an empty string.
    $html = file_get_contents($htmlFile);
    if ($html === false || $html === '') {
        echo "读取失败：$htmlFile\n";
        continue;
    }

    // NOTE(review): loadHTML() falls back to ISO-8859-1 when the document
    // declares no charset — confirm the input pages carry a <meta charset>,
    // otherwise non-ASCII text may come out garbled.
    $dom = new DOMDocument;
    $dom->loadHTML($html);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);

    // Title: text of the first <h1> element.
    $titleNode = $xpath->query("//h1")->item(0);
    $title = $titleNode ? trim($titleNode->textContent) : '未命名';

    // Publish date: text of the first <time> element.
    $dateNode = $xpath->query("//time")->item(0);
    $date = $dateNode ? trim($dateNode->textContent) : '未知日期';

    // Category: text of the first div.category.
    $categoryNode = $xpath->query("//div[@class='category']")->item(0);
    $category = $categoryNode ? trim($categoryNode->textContent) : '未分类';

    // Body: every <p> under div.content, one paragraph per line.
    $body = '';
    foreach ($xpath->query("//div[@class='content']//p") as $paragraph) {
        $body .= trim($paragraph->textContent) . "\n";
    }
    $body = trim($body);
    // Compare against '' explicitly: a truthiness test would also replace a
    // legitimate body of "0" with the placeholder.
    if ($body === '') {
        $body = '无内容';
    }

    // Assemble the text that will be written out.
    $content = $title . "\n" . $date . "\n" . $category . "\n\n" . $body;

    // Build the file name from the title with characters that are illegal in
    // file names removed (backslash included).
    $safeTitle = preg_replace('/[\\\\\/:*?"<>|]/', '', $title);
    $filename = $safeTitle . '.txt';

    // Fall back to the source file's base name when the sanitized title is
    // empty or the target already exists. The emptiness test must look at the
    // title part: with ".txt" appended the full name is never empty.
    if ($safeTitle === '' || file_exists($filename)) {
        $filename = pathinfo($htmlFile, PATHINFO_FILENAME) . '.txt';
    }

    // Write the result and report success only when the write succeeded.
    if (file_put_contents($filename, $content) === false) {
        echo "保存失败：$filename\n";
        continue;
    }
    echo "保存成功：$filename\n";
}
?>

答：

     <?php
// Batch-convert every .html file in the current directory into a .txt file
// holding the article title, publish date, category and body text.
//
// Output layout per file: title, date, category, blank line, body.
// A status line is echoed for every input file.

// glob() returns false on error; treat that the same as "no files found".
$htmlFiles = glob("*.html") ?: [];

// Collect HTML parse errors internally instead of emitting warnings.
libxml_use_internal_errors(true);

foreach ($htmlFiles as $htmlFile) {
    // Read the page; skip unreadable or empty files because
    // DOMDocument::loadHTML() rejects an empty string.
    $html = file_get_contents($htmlFile);
    if ($html === false || $html === '') {
        echo "读取失败：$htmlFile\n";
        continue;
    }

    // NOTE(review): loadHTML() falls back to ISO-8859-1 when the document
    // declares no charset — confirm the input pages carry a <meta charset>,
    // otherwise non-ASCII text may come out garbled.
    $dom = new DOMDocument;
    $dom->loadHTML($html);
    libxml_clear_errors();
    $xpath = new DOMXPath($dom);

    // Title: text of the first <title> element.
    $titleNode = $xpath->query("//title")->item(0);
    $title = $titleNode ? trim($titleNode->textContent) : '未命名';

    // Publish date: first <span> inside div.person-info.
    $dateNode = $xpath->query("//div[@class='person-info']//span")->item(0);
    $date = $dateNode ? trim($dateNode->textContent) : '未知日期';

    // Category: the text between the first "[" and "]" of the post-title link.
    // The node may be missing, so guard before touching textContent.
    $categoryNode = $xpath->query("//div[@class='post-header-right']//h4[@class='post-title']/a")->item(0);
    $category = '未分类';
    if ($categoryNode !== null
        && preg_match('/\[(.*?)\]/', $categoryNode->textContent, $matches) === 1
    ) {
        $category = trim($matches[1]);
    }

    // Body: text of the first div.post-contents.
    $bodyNode = $xpath->query("//div[@class='post-contents']")->item(0);
    $body = $bodyNode ? trim($bodyNode->textContent) : '无内容';

    // Assemble the text that will be written out.
    $content = $title . "\n" . $date . "\n" . $category . "\n\n" . $body;

    // Build the file name from the title with characters that are illegal in
    // file names removed (backslash included).
    $safeTitle = preg_replace('/[\\\\\/:*?"<>|]/', '', $title);
    $filename = $safeTitle . '.txt';

    // Fall back to the source file's base name when the sanitized title is
    // empty or the target already exists. The emptiness test must look at the
    // title part: with ".txt" appended the full name is never empty.
    if ($safeTitle === '' || file_exists($filename)) {
        $filename = pathinfo($htmlFile, PATHINFO_FILENAME) . '.txt';
    }

    // Write the result and report success only when the write succeeded.
    if (file_put_contents($filename, $content) === false) {
        echo "保存失败：$filename\n";
        continue;
    }
    echo "保存成功：$filename\n";
}
?>

提取出的标题不满意（标题中带有的旧网站名与现在不符，继续追问）。问：修改为只提取“-”前面的字符；
```
<title>根据句柄取进程路径-紫灵幽梦</title>这是标题
```
答：重新修正代码如下：

希米日志

主页 代码 笔记 软件 闲谈 留言
更多
虫洞 友链 关于

2. 偷懒失败，尝试手工分析 HTML 结构，将对应的结构代码交给 GPT（成功识别出分类与正文），提取 OK

目录

希米日志

主页 代码 笔记 软件 闲谈 留言 更多 虫洞 友链 关于

2. 偷懒失败，尝试手工分析 HTML 结构，将对应的结构代码交给 GPT（成功识别出分类与正文），提取 OK

目 录

主页代码笔记软件闲谈留言
更多
虫洞友链关于

目录