feat: add data visualization script for comprehensive analysis

- Introduced `visualize.py` to generate various visualizations including: - Class distribution bar chart - Feature distribution histograms (overlaid by class) - Feature box plots (for top N features) - PCA reduced scatter plot with confidence ellipses - t-SNE reduced scatter plot - Heatmaps for class means and standard deviations - Global feature correlation heatmap - Overview of global feature distributions - Implemented data loading functions to handle multiple file formats and structures. - Added command-line interface for flexible usage with options for feature limits and PCA/t-SNE toggles. Co-authored-by: Copilot <copilot@github.com>
2026-05-14 10:27:32 +08:00 · 2026-05-14 10:27:32 +08:00 · a7e95141d2
parent 5f58d7fb56
commit a7e95141d2
4 changed files with 1293 additions and 5 deletions
--- a/Qfunctions/saveToXlsx.py
+++ b/Qfunctions/saveToXlsx.py
@ -143,7 +143,7 @@ def draw_and_save_cm(file_path):
  df_cm = pd.read_excel(file_path)

  labels = df_cm.columns[1:].tolist()
-  cm = df_cm.values[:, 1:]
+  cm = df_cm.iloc[:, 1:].to_numpy(dtype=float)

  fig, axs = plt.subplots(1, 2, figsize=(12, 6))

--- a/Scripts/check_data.py
+++ b/Scripts/check_data.py
@ -0,0 +1,631 @@
+#!/usr/bin/env python3
+"""
+数据质量检查脚本
+===================
+对 Static/ 下的数据目录执行完整性、统计、平衡性与离群值检查，
+生成详细报告输出到终端，并可保存为文本文件。
+
+用法:
+    python Scripts/check_data.py --folder 20260319Numbers --labels 0 1 2 3 4 5 6 7 8 9
+    python Scripts/check_data.py --folder "20260408 grap" --labels 1 2 3 4 5 6 7 8 9 --output report.txt
+    python Scripts/check_data.py -f 20260319Numbers -l 0 1 2 3 4 5 6 7 8 9
+
+要求:
+    在项目根目录 (Deeplearning/) 下运行，或通过 --root 指定项目根目录。
+"""
+
+import os
+import sys
+import argparse
+import unicodedata
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+
+# ============================================================
+# 工具函数（与 loadData.py 中逻辑保持一致）
+# ============================================================
+
+DEFAULT_FILE_CLASSES = ("xlsx", "xls", "csv")
+
+
+def _has_supported_extension(filename: str, file_classes=DEFAULT_FILE_CLASSES) -> bool:
+    """Tell whether ``filename`` ends in one of the extensions in ``file_classes``.
+
+    The extension is compared lower-cased and without its leading dot.
+    """
+    _, suffix = os.path.splitext(filename)
+    return suffix.lower().lstrip(".") in file_classes
+
+
+def _read_data_file(file_path: str) -> pd.DataFrame:
+    """Read ``file_path`` with the pandas reader that matches its extension.
+
+    ``.csv`` is read with ``pd.read_csv``; ``.xls`` and ``.xlsx`` are read with
+    ``pd.read_excel``. The extension check is case-insensitive.
+
+    Raises:
+        ValueError: if the extension is none of the three supported ones.
+    """
+    ext = os.path.splitext(file_path)[1].lower()
+    readers = {".csv": pd.read_csv, ".xls": pd.read_excel, ".xlsx": pd.read_excel}
+    reader = readers.get(ext)
+    if reader is None:
+        raise ValueError(
+            f"Unsupported file format: {ext}. Only .xls, .xlsx, and .csv are supported. "
+            f"File: {file_path}"
+        )
+    return reader(file_path)
+
+
+def _strip_zero_width(s: str) -> str:
+    """Remove U+200B, U+200C, U+200D and U+FEFF from ``s``.
+
+    Non-string inputs are returned unchanged.
+    """
+    if isinstance(s, str):
+        # Mapping a code point to None makes str.translate delete it.
+        removal_table = dict.fromkeys((0x200B, 0x200C, 0x200D, 0xFEFF))
+        return s.translate(removal_table)
+    return s
+
+
+def _canonicalize_name(name: str) -> str:
+    """Return ``name`` NFKC-normalized and with zero-width characters removed."""
+    return _strip_zero_width(unicodedata.normalize("NFKC", name))
+
+
+def _normalize_for_compare(name: str) -> str:
+    """Build a relaxed comparison key for a file name.
+
+    The name is canonicalized, underscores become spaces, runs of whitespace
+    collapse to a single space, and the result is lower-cased.
+    """
+    canonical = _canonicalize_name(name)
+    tokens = canonical.replace("_", " ").split()
+    return " ".join(tokens).lower()
+
+
+def _find_matching_file(folder: str, expected_name: str):
+    """Find the entry of ``folder`` that corresponds to ``expected_name``.
+
+    Three passes are tried in order, each over the whole directory listing:
+    canonical equality, case-insensitive canonical equality, and equality of
+    the relaxed comparison keys. The first hit is returned as it appears on
+    disk. Returns ``None`` when nothing matches or ``folder`` does not exist.
+    """
+    wanted = _canonicalize_name(expected_name)
+    try:
+        candidates = os.listdir(folder)
+    except FileNotFoundError:
+        return None
+
+    # Each pass pairs a key function with a lazily computed target key, so the
+    # looser keys are only built if the stricter passes found nothing.
+    passes = (
+        (_canonicalize_name, lambda: wanted),
+        (lambda entry: _canonicalize_name(entry).lower(), lambda: wanted.lower()),
+        (_normalize_for_compare, lambda: _normalize_for_compare(expected_name)),
+    )
+    for key_of, make_target in passes:
+        target = make_target()
+        for entry in candidates:
+            if key_of(entry) == target:
+                return entry
+    return None
+
+
+def _find_matching_file_by_label(folder: str, label_name, file_classes):
+    """Return the first file in ``folder`` named ``<label_name>.<ext>``.
+
+    Extensions are tried in the order given by ``file_classes``; ``None`` is
+    returned when no extension yields a match.
+    """
+    # The generator is lazy, so lookups stop at the first extension that matches.
+    hits = (
+        _find_matching_file(folder, f"{label_name}.{ext}") for ext in file_classes
+    )
+    return next((hit for hit in hits if hit is not None), None)
+
+
+# ============================================================
+# 报告生成工具
+# ============================================================
+
+class ReportBuffer:
+    """Collect report lines, echoing each to stdout, and optionally save them to a file."""
+
+    def __init__(self, output_path=None):
+        # Lines are kept in insertion order so save() can replay them verbatim.
+        self.lines: list[str] = []
+        self.output_path = output_path
+
+    def add(self, text: str = ""):
+        """Print ``text`` and remember it; with no argument an empty line is added."""
+        print(text)
+        self.lines.append(text)
+
+    def save(self):
+        """Write all collected lines to ``output_path`` as UTF-8; do nothing if no path was set."""
+        if not self.output_path:
+            return
+        with open(self.output_path, "w", encoding="utf-8") as handle:
+            handle.write("\n".join(self.lines) + "\n")
+        print(f"\n报告已保存到: {self.output_path}")
+
+
+# ============================================================
+# 核心检查逻辑
+# ============================================================
+
+def _extract_features(df: pd.DataFrame, source: str) -> pd.DataFrame:
+    """
+    Extract every second column of ``df`` as numeric features.
+
+    The columns at zero-based positions 1, 3, 5, ... (the 2nd, 4th, 6th, ...
+    columns) are selected and converted with ``pd.to_numeric``; values that
+    cannot be parsed become NaN. The returned frame keeps the original labels
+    of the selected columns.
+
+    Selection and conversion are done purely by position, so frames whose
+    column labels repeat are handled correctly.
+
+    Args:
+        df: Raw table as read from a data file.
+        source: File name, used only in the error message.
+
+    Raises:
+        ValueError: if ``df`` has fewer than two columns, i.e. no feature column.
+    """
+    feature_positions = range(1, df.shape[1], 2)
+    if not feature_positions:
+        raise ValueError(f"没有找到偶数列（特征列）。请检查文件: {source}")
+
+    # Convert column by column via iloc: label-based access would return a
+    # DataFrame (not a Series) whenever a label occurs more than once.
+    converted = [
+        pd.to_numeric(df.iloc[:, pos], errors="coerce") for pos in feature_positions
+    ]
+    return pd.concat(converted, axis=1)
+
+
+def check_tabular_project(root: str, folder: str, labels: list[str], rp: ReportBuffer):
+    """Run the full data-quality check for ``<root>/Static/<folder>``.
+
+    The layout is detected first: "single file" means every label has a
+    matching data file directly in the data directory; "multi folder" means
+    every label has a sub-directory containing at least one supported file.
+    Exactly one of the two must hold, otherwise an error is reported and the
+    check stops. All output goes through ``rp``.
+    """
+    data_dir = os.path.join(root, "Static", folder)
+    if not os.path.isdir(data_dir):
+        rp.add(f"[ERROR] 目录不存在: {data_dir}")
+        rp.add("请确认 --folder 参数正确。")
+        return
+
+    rule = "=" * 64
+    rp.add(rule)
+    rp.add("  Deeplearning 数据质量检查报告")
+    rp.add(rule)
+    rp.add(f"  数据目录  : {data_dir}")
+    rp.add(f"  标签数量  : {len(labels)}")
+    rp.add(f"  标签列表  : {labels}")
+    rp.add()
+
+    # ---- Step 1: detect the data layout ----
+    def _has_populated_subfolder(lbl) -> bool:
+        sub = os.path.join(data_dir, str(lbl))
+        return os.path.isdir(sub) and any(
+            _has_supported_extension(f) for f in os.listdir(sub)
+        )
+
+    has_all_subfolders = all(_has_populated_subfolder(lbl) for lbl in labels)
+    has_all_files = all(
+        _find_matching_file_by_label(data_dir, lbl, DEFAULT_FILE_CLASSES) is not None
+        for lbl in labels
+    )
+
+    # Both layouts present, or neither: the mode is ambiguous.
+    if has_all_files == has_all_subfolders:
+        rp.add("[ERROR] 无法自动检测数据模式，或两种模式同时存在。")
+        rp.add(f"  has_all_files    : {has_all_files}")
+        rp.add(f"  has_all_subfolders: {has_all_subfolders}")
+        rp.add("请确保每个 label 对应唯一的文件或唯一的子目录。")
+        return
+
+    if has_all_files:
+        _check_single_file_mode(data_dir, labels, rp)
+    else:
+        _check_multi_folder_mode(data_dir, labels, rp)
+
+    rp.add()
+    rp.add(rule)
+    rp.add("  检查完成。")
+    rp.add(rule)
+
+
+def _check_single_file_mode(data_dir: str, labels: list[str], rp: ReportBuffer):
+    """Check a dataset laid out as one data file per label inside ``data_dir``.
+
+    For each label the matching file is located and read, every second column
+    is extracted as features, and rows containing NaN are dropped. Per-label
+    results are reported through ``rp``, followed by the balance, statistics
+    and outlier analyses over the labels that yielded at least one sample.
+    Labels whose file cannot be found are reported and skipped.
+    """
+    rp.add()
+    rp.add("── 数据模式: 单文件模式 ──")
+    rp.add()
+
+    # 1. Resolve the actual file name for every label.
+    file_map: dict[str, str] = {}
+    missing: list[str] = []
+    for lbl in labels:
+        match = _find_matching_file_by_label(data_dir, lbl, DEFAULT_FILE_CLASSES)
+        if match:
+            file_map[lbl] = match
+        else:
+            missing.append(lbl)
+
+    if missing:
+        rp.add(f"[WARN] 以下标签找不到对应文件: {missing}")
+        rp.add(f"当前目录内容: {sorted(os.listdir(data_dir))}")
+        if not file_map:
+            return
+        labels = [lbl for lbl in labels if lbl in file_map]
+
+    # 2. Read each class.
+    all_features = []  # list of (label, pd.DataFrame)
+    per_class_info: dict[str, dict] = {}
+    col_counts: dict[str, int] = {}
+
+    for lbl in labels:
+        fname = file_map[lbl]
+        file_path = os.path.join(data_dir, fname)
+        info: dict[str, object] = {"label": lbl, "file": fname, "warnings": []}
+
+        try:
+            raw = _read_data_file(file_path)
+        except Exception as e:
+            info["error"] = str(e)
+            per_class_info[lbl] = info
+            rp.add(f"[ERROR] 读取文件失败: {file_path} — {e}")
+            continue
+
+        info["raw_rows"] = raw.shape[0]
+        info["raw_cols"] = raw.shape[1]
+
+        # NaN cells anywhere in the raw file, not only in feature columns.
+        total_nan = raw.isna().sum().sum()
+        if total_nan > 0:
+            info["warnings"].append(f"原始文件含 {total_nan} 个 NaN 单元格")
+
+        try:
+            features = _extract_features(raw, fname)
+        except ValueError as e:
+            info["error"] = str(e)
+            per_class_info[lbl] = info
+            rp.add(f"[ERROR] 特征提取失败: {file_path} — {e}")
+            continue
+
+        # Count samples after dropping rows that contain NaN.
+        clean = features.dropna()
+        info["feature_cols"] = features.shape[1]
+        info["samples_after_dropna"] = clean.shape[0]
+        info["dropped_nan_rows"] = features.shape[0] - clean.shape[0]
+        info["values"] = clean.values
+
+        col_counts[lbl] = features.shape[1]
+
+        if clean.shape[0] == 0:
+            info["warnings"].append("去除 NaN 后无有效样本")
+
+        per_class_info[lbl] = info
+        if clean.shape[0] > 0:
+            all_features.append((lbl, clean))
+
+    # Column-count consistency across labels.
+    if len(set(col_counts.values())) > 1:
+        rp.add()
+        rp.add("[WARN] 各标签的特征列数不一致！")
+        for lbl, cc in col_counts.items():
+            rp.add(f"  {lbl}: {cc} 列")
+        rp.add("这会导致 load_data 时补零逻辑产生差异。")
+    elif col_counts:
+        # Only claim consistency when at least one label was actually loaded;
+        # otherwise an "[OK] ... 0" line would hide that every file failed.
+        rp.add(f"[OK] 所有标签特征列数一致: {next(iter(col_counts.values()))} 列")
+
+    # Per-class sample counts.
+    rp.add()
+    rp.add("── 各类别样本数 ──")
+    sample_counts: dict[str, int] = {}
+    for lbl in labels:
+        info = per_class_info.get(lbl, {})
+        if "error" in info:
+            rp.add(f"  [{lbl}] 加载失败: {info['error']}")
+            continue
+        n = info.get("samples_after_dropna", 0)
+        sample_counts[lbl] = n
+        warnings = info.get("warnings", [])
+        wflag = f"  ⚠ {'; '.join(warnings)}" if warnings else ""
+        rp.add(f"  [{lbl}] {n} 行 (文件: {info.get('file','?')}, "
+               f"原始 {info.get('raw_rows','?')} 行, "
+               f"丢弃 NaN 行 {info.get('dropped_nan_rows',0)}){wflag}")
+
+    # Class balance.
+    _analyze_balance(sample_counts, rp)
+
+    # Statistics and outliers.
+    _analyze_statistics(all_features, rp)
+    _analyze_outliers(all_features, rp)
+
+
+def _check_multi_folder_mode(data_dir: str, labels: list[str], rp: ReportBuffer):
+    """Check a dataset laid out as one sub-directory per label.
+
+    Every supported file inside a label's sub-directory is read, every second
+    column is extracted as features, rows containing NaN are dropped, and the
+    remaining rows of all files are stacked into one table per label. Files
+    are stacked by column position; a file with fewer feature columns than the
+    widest file of its label is zero-padded on the right. Results are reported
+    through ``rp``, followed by the balance, statistics and outlier analyses.
+    """
+    rp.add()
+    rp.add("── 数据模式: 多子特征模式 ──")
+    rp.add()
+
+    all_features = []
+    per_class_info: dict[str, dict] = {}
+    col_counts: dict[str, int] = {}
+
+    for lbl in labels:
+        sub = os.path.join(data_dir, str(lbl))
+        if not os.path.isdir(sub):
+            per_class_info[lbl] = {"error": f"子目录不存在: {sub}"}
+            rp.add(f"[ERROR] {lbl}: 子目录不存在")
+            continue
+
+        files = sorted(
+            [f for f in os.listdir(sub) if _has_supported_extension(f)]
+        )
+        if not files:
+            per_class_info[lbl] = {"error": f"子目录下无支持的文件: {sub}"}
+            rp.add(f"[ERROR] {lbl}: 子目录下无 .xlsx/.xls/.csv 文件")
+            continue
+
+        class_frame_list = []
+        single_file_cols = set()
+        total_raw = 0
+        total_dropped = 0
+        failed_files = []
+
+        for fname in files:
+            file_path = os.path.join(sub, fname)
+            try:
+                raw = _read_data_file(file_path)
+            except Exception as e:
+                failed_files.append(f"  {fname}: {e}")
+                continue
+
+            total_raw += raw.shape[0]
+            try:
+                features = _extract_features(raw, f"{lbl}/{fname}")
+            except ValueError as e:
+                failed_files.append(f"  {fname}: {e}")
+                continue
+
+            single_file_cols.add(features.shape[1])
+            clean = features.dropna()
+            total_dropped += features.shape[0] - clean.shape[0]
+            if clean.shape[0] > 0:
+                class_frame_list.append(clean)
+
+        info: dict[str, object] = {
+            "label": lbl,
+            "num_files": len(files),
+            "raw_rows_total": total_raw,
+            "dropped_nan_rows": total_dropped,
+            "warnings": [],
+        }
+
+        if failed_files:
+            info["warnings"].append(f"{len(failed_files)} 个文件加载失败")
+            for ff in failed_files:
+                rp.add(f"  [WARN] {ff}")
+
+        if len(single_file_cols) > 1:
+            info["warnings"].append(
+                f"子文件间列数不一致: {sorted(single_file_cols)}"
+            )
+            col_counts[lbl] = max(single_file_cols)
+        elif single_file_cols:
+            col_counts[lbl] = single_file_cols.pop()
+        else:
+            col_counts[lbl] = 0
+
+        if class_frame_list:
+            # Features are positional, so stack the files by position.
+            # pd.concat would align on column labels and fill NaN wherever
+            # headers or column counts differ between files, which would turn
+            # every downstream statistic into NaN.
+            width = max(frame.shape[1] for frame in class_frame_list)
+            blocks = []
+            for frame in class_frame_list:
+                block = frame.to_numpy(dtype=float)
+                if block.shape[1] < width:
+                    pad = np.zeros((block.shape[0], width - block.shape[1]))
+                    block = np.hstack([block, pad])
+                blocks.append(block)
+            combined = pd.DataFrame(np.vstack(blocks))
+            info["samples_after_dropna"] = combined.shape[0]
+            info["feature_cols"] = combined.shape[1]
+            info["values"] = combined.values
+            all_features.append((lbl, combined))
+        else:
+            info["samples_after_dropna"] = 0
+            info["warnings"].append("无有效样本")
+
+        per_class_info[lbl] = info
+
+    # Column-count consistency across labels (labels without columns are ignored).
+    non_zero = {lbl: c for lbl, c in col_counts.items() if c > 0}
+    if non_zero and len(set(non_zero.values())) > 1:
+        rp.add()
+        rp.add("[WARN] 各标签的特征列数不一致（将使用零填充对齐）:")
+        for lbl, cc in col_counts.items():
+            rp.add(f"  {lbl}: {cc} 列")
+    elif non_zero:
+        rp.add(f"[OK] 所有标签特征列数一致: {next(iter(non_zero.values()))} 列")
+
+    # Per-class sample counts.
+    rp.add()
+    rp.add("── 各类别样本数 ──")
+    sample_counts: dict[str, int] = {}
+    for lbl in labels:
+        info = per_class_info.get(lbl, {})
+        if "error" in info:
+            rp.add(f"  [{lbl}] 加载失败: {info['error']}")
+            continue
+        n = info.get("samples_after_dropna", 0)
+        sample_counts[lbl] = n
+        wflag = ""
+        if info.get("warnings"):
+            wflag = f"  ⚠ {'; '.join(info['warnings'])}"
+        rp.add(f"  [{lbl}] {n} 行 "
+               f"(来自 {info.get('num_files','?')} 个文件, "
+               f"原始 {info.get('raw_rows_total','?')} 行, "
+               f"丢弃 NaN 行 {info.get('dropped_nan_rows',0)}){wflag}")
+
+    _analyze_balance(sample_counts, rp)
+    _analyze_statistics(all_features, rp)
+    _analyze_outliers(all_features, rp)
+
+
+# ============================================================
+# 分析子模块
+# ============================================================
+
+def _analyze_balance(counts: dict[str, int], rp: ReportBuffer):
+    """Report class-balance figures for the per-label sample counts.
+
+    Reports total, class count, mean, smallest and largest class, the max/min
+    imbalance ratio with a verdict (warn above 3:1, stronger warning above
+    5:1), the coefficient of variation, and a rough per-class train/test
+    estimate for a 20% test share. Stops early when ``counts`` is empty or
+    any class has zero samples.
+    """
+    rp.add()
+    rp.add("── 类别平衡性分析 ──")
+    if not counts:
+        rp.add("无有效样本，跳过。")
+        return
+
+    per_class = list(counts.values())
+    total = sum(per_class)
+    n_classes = len(per_class)
+    avg = total / n_classes if n_classes else 0
+    smallest_label = min(counts, key=counts.get)
+    largest_label = max(counts, key=counts.get)
+    min_count = counts[smallest_label]
+    max_count = counts[largest_label]
+
+    rp.add(f"  总样本数    : {total}")
+    rp.add(f"  类别数      : {n_classes}")
+    rp.add(f"  平均每类    : {avg:.1f}")
+    rp.add(f"  最少样本类  : {smallest_label} ({min_count})")
+    rp.add(f"  最多样本类  : {largest_label} ({max_count})")
+
+    if min_count == 0:
+        rp.add("  [ERROR] 存在样本数为 0 的类别，训练将无法进行！")
+        return
+
+    if min_count > 0:
+        ratio = max_count / min_count
+    else:
+        ratio = float("inf")
+    rp.add(f"  不平衡比例  : {ratio:.2f}:1 (max/min)")
+
+    spread = float(np.std(per_class))
+    cv = spread / avg if avg > 0 else 0
+    rp.add(f"  变异系数(CV): {cv:.4f}")
+
+    if ratio > 5:
+        verdict = "  [WARN] 类别严重不平衡 (>5:1)，建议进行数据增强或使用类别权重。"
+    elif ratio > 3:
+        verdict = "  [WARN] 类别较不平衡 (>3:1)，可考虑采样策略。"
+    else:
+        verdict = "  [OK] 类别基本平衡。"
+    rp.add(verdict)
+
+    # Rough train/test split estimate; every class contributes at least one test sample.
+    rp.add()
+    rp.add("  ── 训练/测试划分预估 (test_size=0.2, stratify) ──")
+    for lbl, cnt in sorted(counts.items()):
+        test_n = max(1, int(cnt * 0.2))
+        rp.add(f"    [{lbl}] 训练 {cnt - test_n} / 测试 {test_n} (总计 {cnt})")
+
+
+def _analyze_statistics(
+    all_features: list[tuple[str, pd.DataFrame]],
+    rp: ReportBuffer,
+):
+    """Report global, per-feature and per-class summary statistics.
+
+    Class tables narrower than the widest one are zero-padded on the right
+    before being stacked into the global matrix, so the padding zeros take
+    part in the global and per-feature figures. Per-feature figures are shown
+    for at most the first 12 columns. Per-class figures use the unpadded data.
+    """
+    rp.add()
+    rp.add("── 特征统计信息 ──")
+    if not all_features:
+        rp.add("无有效数据，跳过。")
+        return
+
+    # Global statistics over all classes, zero-padded to a common width.
+    max_cols = max(frame.shape[1] for _, frame in all_features)
+    stacked_parts = []
+    for _, frame in all_features:
+        block = frame.values
+        shortfall = max_cols - block.shape[1]
+        if shortfall > 0:
+            filler = np.zeros((block.shape[0], shortfall), dtype=block.dtype)
+            block = np.hstack([block, filler])
+        stacked_parts.append(block)
+    all_values = np.vstack(stacked_parts)
+    rp.add(f"  全局特征维度: {all_values.shape} (样本数 × 特征数, 零填充对齐到 {max_cols} 列)")
+    rp.add(f"  全局均值    : {np.mean(all_values):.6f}")
+    rp.add(f"  全局标准差  : {np.std(all_values):.6f}")
+    rp.add(f"  全局最小值  : {np.min(all_values):.6f}")
+    rp.add(f"  全局最大值  : {np.max(all_values):.6f}")
+    rp.add(f"  全局中位数  : {np.median(all_values):.6f}")
+
+    # Per-feature statistics, limited to the leading columns.
+    rp.add()
+    n_cols = min(all_values.shape[1], 12)
+    rp.add(f"  ── 前 {n_cols} 个特征维度的分布 (均值 ± 标准差) ──")
+    for number in range(1, n_cols + 1):
+        col = all_values[:, number - 1]
+        rp.add(
+            f"    feature{number}: "
+            f"μ={np.mean(col):.4f}  σ={np.std(col):.4f}  "
+            f"[{np.min(col):.4f}, {np.max(col):.4f}]  "
+            f"med={np.median(col):.4f}"
+        )
+
+    # Brief per-class statistics over all of a class's values.
+    rp.add()
+    rp.add("  ── 各类别特征统计 ──")
+    for lbl, frame in all_features:
+        class_values = frame.values
+        rp.add(
+            f"    [{lbl}] "
+            f"μ={np.mean(class_values):.4f}  σ={np.std(class_values):.4f}  "
+            f"范围 [{np.min(class_values):.4f}, {np.max(class_values):.4f}]"
+        )
+
+
+def _analyze_outliers(
+    all_features: list[tuple[str, pd.DataFrame]],
+    rp: ReportBuffer,
+):
+    """Report, per class and overall, how many samples are IQR outliers.
+
+    Within each class the quartiles are computed per feature column; a sample
+    counts as an outlier when any of its features lies outside
+    ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` for that column. A class is tagged OK
+    below 10% outliers, WARN below 25%, ERROR otherwise.
+    """
+    rp.add()
+    rp.add("── 离群值检测 (基于 IQR) ──")
+    if not all_features:
+        rp.add("无有效数据，跳过。")
+        return
+
+    flagged_total = 0
+    seen_total = 0
+
+    for lbl, frame in all_features:
+        data = frame.values
+        n_rows = data.shape[0]
+        seen_total += n_rows
+
+        q1 = np.percentile(data, 25, axis=0)
+        q3 = np.percentile(data, 75, axis=0)
+        iqr = q3 - q1
+        lower = q1 - 1.5 * iqr
+        upper = q3 + 1.5 * iqr
+
+        # A row is an outlier as soon as one of its features leaves the fence.
+        row_is_outlier = np.any((data < lower) | (data > upper), axis=1)
+        n_outliers = int(np.sum(row_is_outlier))
+        flagged_total += n_outliers
+
+        pct = n_outliers / n_rows * 100 if n_rows > 0 else 0
+        if pct < 10:
+            status = "[OK]"
+        elif pct < 25:
+            status = "[WARN]"
+        else:
+            status = "[ERROR]"
+        rp.add(f"  [{lbl}] 离群样本: {n_outliers}/{n_rows} ({pct:.1f}%) {status}")
+
+    overall_pct = flagged_total / seen_total * 100 if seen_total > 0 else 0
+    rp.add()
+    rp.add(f"  整体离群比例: {flagged_total}/{seen_total} ({overall_pct:.1f}%)")
+
+    if overall_pct > 20:
+        rp.add("  [WARN] 超过 20% 数据为离群值，请确认数据清洗是否正确。")
+    elif overall_pct > 10:
+        rp.add("  [INFO] 离群值比例偏高，训练时可能影响收敛。")
+    else:
+        rp.add("  [OK] 离群值比例正常。")
+
+
+# ============================================================
+# 命令行入口
+# ============================================================
+
+def main():
+    """Command-line entry point: parse arguments, run the check, save the report.
+
+    The project root is ``--root`` when given, otherwise the parent of the
+    directory containing this script. The process exits with status 1 when
+    the resolved root is not a directory.
+    """
+    parser = argparse.ArgumentParser(
+        description="Deeplearning 数据质量检查脚本",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例:
+    python Scripts/check_data.py -f 20260319Numbers -l 0 1 2 3 4 5 6 7 8 9
+    python Scripts/check_data.py -f "20260408 grap" -l 1 2 3 4 5 6 7 8 9 -o report.txt
+    python Scripts/check_data.py -f 20260319Numbers -l 0 1 2 3 4 5 6 7 8 9 -r /path/to/project
+""",
+    )
+    parser.add_argument(
+        "-f", "--folder", required=True,
+        help="Static/ 下的数据目录名（例如 20260319Numbers）",
+    )
+    parser.add_argument(
+        "-l", "--labels", nargs="+", required=True,
+        help="类别标签列表，空格分隔（例如 0 1 2 3 4 或 A B C）",
+    )
+    parser.add_argument(
+        "-o", "--output", default=None,
+        help="将报告保存到指定文件路径",
+    )
+    parser.add_argument(
+        "-r", "--root", default=None,
+        help="项目根目录（默认为脚本的上级目录，即 Deeplearning/）",
+    )
+    args = parser.parse_args()
+
+    # Resolve the project root; the script-relative default is only computed when needed.
+    root = os.path.abspath(
+        args.root or str(Path(__file__).resolve().parent.parent)
+    )
+    if not os.path.isdir(root):
+        print(f"[ERROR] 项目根目录不存在: {root}")
+        sys.exit(1)
+
+    rp = ReportBuffer(output_path=args.output)
+    check_tabular_project(root=root, folder=args.folder, labels=args.labels, rp=rp)
+    rp.save()
+
+
+if __name__ == "__main__":
+    main()
--- a/Scripts/visualize.py
+++ b/Scripts/visualize.py
@ -0,0 +1,657 @@
+#!/usr/bin/env python3
+"""
+数据可视化脚本
+===============
+加载 Static/ 下的数据目录，生成多种可视化图表：
+  - 类别分布柱状图
+  - 特征分布直方图（各类叠加）
+  - 特征箱线图（前 N 个特征）
+  - PCA 降维散点图 + 置信椭圆
+  - t-SNE 降维散点图
+  - 各类别均值/标准差对比热力图
+  - 全局特征相关性热力图
+  - 全局特征分布概览
+
+用法:
+    python Scripts/visualize.py -f 20260319Numbers -l 0 1 2 3 4 5 6 7 8 9
+    python Scripts/visualize.py -f "20260408 grap" -l 1 2 3 4 5 6 7 8 9 --max-features 20
+    python Scripts/visualize.py -f 20260319Numbers -l 0 1 2 3 4 5 6 7 8 9 --no-tsne
+
+输出目录: Visualizations/<folder>/
+"""
+
+import os
+import sys
+import argparse
+import unicodedata
+from pathlib import Path
+import warnings
+
+import numpy as np
+import pandas as pd
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+warnings.filterwarnings("ignore", category=FutureWarning)
+warnings.filterwarnings("ignore", category=UserWarning)
+
+# ============================================================
+# 数据加载工具（与 check_data.py 保持一致）
+# ============================================================
+
+DEFAULT_FILE_CLASSES = ("xlsx", "xls", "csv")
+
+
+def _has_supported_extension(filename: str, file_classes=DEFAULT_FILE_CLASSES) -> bool:
+    """Return whether the lower-cased, dot-less extension of ``filename`` is in ``file_classes``."""
+    extension = os.path.splitext(filename)[1]
+    extension = extension.lower().lstrip(".")
+    return extension in file_classes
+
+
+def _read_data_file(file_path: str) -> pd.DataFrame:
+    """Load ``file_path`` as a DataFrame based on its (case-insensitive) extension.
+
+    Excel files (``.xls``/``.xlsx``) use ``pd.read_excel`` and ``.csv`` uses
+    ``pd.read_csv``; anything else raises ``ValueError``.
+    """
+    ext = os.path.splitext(file_path)[1].lower()
+    if ext in (".xls", ".xlsx"):
+        return pd.read_excel(file_path)
+    if ext != ".csv":
+        raise ValueError(f"Unsupported file format: {ext}: {file_path}")
+    return pd.read_csv(file_path)
+
+
+def _strip_zero_width(s: str) -> str:
+    """Delete U+200B, U+200C, U+200D and U+FEFF from ``s``; pass non-strings through untouched."""
+    if isinstance(s, str):
+        zero_width_points = (0x200B, 0x200C, 0x200D, 0xFEFF)
+        return s.translate({point: None for point in zero_width_points})
+    return s
+
+
+def _canonicalize_name(name: str) -> str:
+    """Apply NFKC normalization to ``name`` and then drop zero-width characters."""
+    normalized = unicodedata.normalize("NFKC", name)
+    cleaned = _strip_zero_width(normalized)
+    return cleaned
+
+
+def _normalize_for_compare(name: str) -> str:
+    """Return a loose comparison key: canonical form, ``_`` as space, single-spaced, lower-case."""
+    spaced = _canonicalize_name(name).replace("_", " ")
+    return " ".join(spaced.split()).lower()
+
+
+def _find_matching_file(folder: str, expected_name: str):
+    """Locate the directory entry of ``folder`` that corresponds to ``expected_name``.
+
+    Matching gets progressively looser: canonical names equal, then canonical
+    names equal ignoring case, then relaxed comparison keys equal. Each stage
+    scans the whole listing before the next stage is tried. Returns the
+    on-disk name of the first hit, or ``None`` if there is no hit or the
+    folder does not exist.
+    """
+    wanted = _canonicalize_name(expected_name)
+    try:
+        names = os.listdir(folder)
+    except FileNotFoundError:
+        return None
+
+    def first_where(predicate):
+        # First listing entry satisfying the predicate, else None.
+        for candidate in names:
+            if predicate(candidate):
+                return candidate
+        return None
+
+    exact = first_where(lambda n: _canonicalize_name(n) == wanted)
+    if exact is not None:
+        return exact
+
+    wanted_lower = wanted.lower()
+    caseless = first_where(lambda n: _canonicalize_name(n).lower() == wanted_lower)
+    if caseless is not None:
+        return caseless
+
+    wanted_relaxed = _normalize_for_compare(expected_name)
+    return first_where(lambda n: _normalize_for_compare(n) == wanted_relaxed)
+
+
+def _find_matching_file_by_label(folder: str, label_name, file_classes):
+    """Try ``<label_name>.<ext>`` for each extension in order and return the first match, else ``None``."""
+    for extension in file_classes:
+        found = _find_matching_file(folder, f"{label_name}.{extension}")
+        if found is None:
+            continue
+        return found
+    return None
+
+
+def _extract_features(df: pd.DataFrame, source: str) -> pd.DataFrame:
+    """Return the columns at zero-based positions 1, 3, 5, ... of ``df`` as numeric features.
+
+    Each selected column is converted with ``pd.to_numeric``; unparseable
+    values become NaN. Column labels of the selected columns are kept. All
+    access is positional, so repeated column labels do not change which
+    columns are picked.
+
+    Args:
+        df: Raw table as read from a data file.
+        source: File name, used only in the error message.
+
+    Raises:
+        ValueError: if ``df`` has fewer than two columns.
+    """
+    positions = range(1, df.shape[1], 2)
+    if not positions:
+        raise ValueError(f"没有找到偶数列（特征列）。请检查文件: {source}")
+    # iloc keeps every column a Series even when labels are duplicated;
+    # label-based lookup would hand pd.to_numeric a DataFrame in that case.
+    numeric_columns = [
+        pd.to_numeric(df.iloc[:, pos], errors="coerce") for pos in positions
+    ]
+    return pd.concat(numeric_columns, axis=1)
+
+
+# ============================================================
+# 数据加载
+# ============================================================
+
+def load_all_data(root: str, folder: str, labels: list[str]):
+    """Load the dataset under ``<root>/Static/<folder>`` for the given labels.
+
+    The layout is detected the same way for every label: a matching data file
+    directly in the data directory (single-file layout) or a sub-directory
+    holding at least one supported file (multi-folder layout). When exactly
+    the multi-folder layout is present it is used; in every other case the
+    single-file loader is used, with a warning if the layout is ambiguous.
+    Exits with status 1 when the data directory does not exist.
+    """
+    data_dir = os.path.join(root, "Static", folder)
+    if not os.path.isdir(data_dir):
+        print(f"[ERROR] 目录不存在: {data_dir}")
+        sys.exit(1)
+
+    def has_file(lbl) -> bool:
+        return _find_matching_file_by_label(data_dir, lbl, DEFAULT_FILE_CLASSES) is not None
+
+    def has_populated_subfolder(lbl) -> bool:
+        sub = os.path.join(data_dir, str(lbl))
+        return os.path.isdir(sub) and any(
+            _has_supported_extension(f) for f in os.listdir(sub)
+        )
+
+    has_all_files = all(has_file(lbl) for lbl in labels)
+    has_all_subfolders = all(has_populated_subfolder(lbl) for lbl in labels)
+
+    if has_all_subfolders and not has_all_files:
+        return _load_multi_folder_mode(data_dir, labels)
+
+    # Anything other than "files only" is ambiguous: warn, then fall back to single-file.
+    if not (has_all_files and not has_all_subfolders):
+        print("[WARN] 数据模式不明确，尝试单文件模式...")
+    return _load_single_file_mode(data_dir, labels)
+
+
+def _load_single_file_mode(data_dir: str, labels: list[str]):
+    """Load one data file per label from ``data_dir``.
+
+    Labels whose file is missing, unreadable, has no feature columns, or has
+    no rows left after dropping NaN are reported on stdout and skipped. The
+    surviving (label, features) pairs are handed to ``_build_arrays``.
+    """
+    loaded = []
+
+    for lbl in labels:
+        fname = _find_matching_file_by_label(data_dir, lbl, DEFAULT_FILE_CLASSES)
+        if fname is None:
+            print(f"[WARN] 标签 {lbl} 找不到文件，跳过")
+            continue
+
+        file_path = os.path.join(data_dir, fname)
+        try:
+            raw = _read_data_file(file_path)
+        except Exception as e:
+            print(f"[ERROR] 读取 {file_path} 失败: {e}")
+            continue
+
+        try:
+            features = _extract_features(raw, fname)
+        except ValueError as e:
+            print(f"[ERROR] {e}")
+            continue
+
+        clean = features.dropna()
+        if clean.shape[0] == 0:
+            print(f"[WARN] 标签 {lbl} 去除 NaN 后无样本，跳过")
+            continue
+
+        loaded.append((lbl, clean))
+
+    # Derive the bookkeeping structures from what was actually loaded.
+    label_names = [lbl for lbl, _ in loaded]
+    col_counts = {lbl: frame.shape[1] for lbl, frame in loaded}
+    return _build_arrays(loaded, label_names, col_counts)
+
+
+def _load_multi_folder_mode(data_dir: str, labels: list[str]):
+    """Load every supported file from each label's sub-directory of ``data_dir``.
+
+    For each label, features are extracted from every readable file, rows
+    containing NaN are dropped, and the remaining rows are stacked into one
+    table. Stacking is positional: files with fewer feature columns than the
+    widest file of the label are zero-padded on the right. The combined table
+    holds float values and carries the column labels of the widest file.
+    Labels that yield no samples are reported on stdout and skipped. The
+    result is handed to ``_build_arrays``.
+    """
+    all_features = []
+    col_counts = {}
+    label_names = []
+
+    for lbl in labels:
+        sub = os.path.join(data_dir, str(lbl))
+        if not os.path.isdir(sub):
+            print(f"[WARN] 标签 {lbl} 子目录不存在，跳过")
+            continue
+
+        files = sorted(f for f in os.listdir(sub) if _has_supported_extension(f))
+        if not files:
+            print(f"[WARN] 标签 {lbl} 子目录无文件，跳过")
+            continue
+
+        frames = []
+        for fname in files:
+            file_path = os.path.join(sub, fname)
+            try:
+                raw = _read_data_file(file_path)
+            except Exception as e:
+                print(f"[WARN] 读取 {file_path} 失败: {e}")
+                continue
+            try:
+                feat = _extract_features(raw, f"{lbl}/{fname}")
+            except ValueError as e:
+                print(f"[WARN] {e}")
+                continue
+            clean = feat.dropna()
+            if clean.shape[0] > 0:
+                frames.append(clean)
+
+        if not frames:
+            print(f"[WARN] 标签 {lbl} 无有效样本，跳过")
+            continue
+
+        # Stack by position on plain arrays. pd.concat aligns on column
+        # labels, so padded or differently-headed frames would produce the
+        # union of all columns filled with NaN instead of a zero-padded matrix.
+        width = max(frame.shape[1] for frame in frames)
+        widest = next(frame for frame in frames if frame.shape[1] == width)
+        blocks = []
+        for frame in frames:
+            values = frame.to_numpy(dtype=float)
+            if values.shape[1] < width:
+                pad = np.zeros((values.shape[0], width - values.shape[1]))
+                values = np.hstack([values, pad])
+            blocks.append(values)
+
+        combined = pd.DataFrame(np.vstack(blocks), columns=list(widest.columns))
+        col_counts[lbl] = combined.shape[1]
+        all_features.append((lbl, combined))
+        label_names.append(lbl)
+
+    return _build_arrays(all_features, label_names, col_counts)
+
+
+def _build_arrays(all_features, label_names, col_counts):
+    """Stack per-class feature tables into one matrix with integer class ids.
+
+    Each class table is zero-padded on the right up to the largest value in
+    ``col_counts`` and the tables are stacked in order; the class id of a row
+    is the position of its class in ``all_features``. Exits with status 1
+    when ``all_features`` is empty.
+
+    Returns:
+        ``(X, y, label_names, all_features, col_counts)`` where ``X`` is the
+        stacked matrix and ``y`` the matching class ids.
+    """
+    if not all_features:
+        print("[ERROR] 没有加载到任何有效数据")
+        sys.exit(1)
+
+    max_cols = max(col_counts.values())
+
+    matrices = []
+    targets = []
+    for class_id, (_, frame) in enumerate(all_features):
+        block = frame.values
+        shortfall = max_cols - block.shape[1]
+        if shortfall > 0:
+            filler = np.zeros((block.shape[0], shortfall), dtype=block.dtype)
+            block = np.hstack([block, filler])
+        matrices.append(block)
+        targets.append(np.full(block.shape[0], class_id, dtype=int))
+
+    return (
+        np.vstack(matrices),
+        np.concatenate(targets),
+        label_names,
+        all_features,
+        col_counts,
+    )
+
+
+# ============================================================
+# 可视化函数
+# ============================================================
+
+TAB10 = plt.cm.tab10.colors
+
+
+def _ensure_dir(path: str):
+    """Create directory ``path`` (including missing parents); an existing directory is fine."""
+    os.makedirs(name=path, exist_ok=True)
+
+
+def plot_class_distribution(y, label_names, out_dir: str):
+    """Save a bar chart of the number of samples per class.
+
+    ``y`` holds integer class ids that index into ``label_names``. Each bar is
+    annotated with its count and the figure is written to
+    ``01_class_distribution.png`` inside ``out_dir``.
+    """
+    n_classes = len(label_names)
+    counts = [int(np.sum(y == class_id)) for class_id in range(n_classes)]
+    palette = [TAB10[class_id % 10] for class_id in range(n_classes)]
+
+    fig, ax = plt.subplots(figsize=(max(8, n_classes * 0.6), 5))
+    bars = ax.bar(label_names, counts, color=palette, edgecolor="white", linewidth=0.8)
+
+    # Place each count just above its bar, offset by 1% of the tallest bar.
+    for bar, cnt in zip(bars, counts):
+        x_center = bar.get_x() + bar.get_width() / 2
+        y_text = bar.get_height() + max(counts) * 0.01
+        ax.text(x_center, y_text, str(cnt), ha="center", va="bottom", fontsize=9)
+
+    ax.set_xlabel("类别")
+    ax.set_ylabel("样本数")
+    ax.set_title(f"类别分布 (总计 {sum(counts)} 样本, {n_classes} 类)")
+    fig.tight_layout()
+
+    path = os.path.join(out_dir, "01_class_distribution.png")
+    fig.savefig(path, dpi=150)
+    plt.close(fig)
+    print(f"[OK] {path}")
+
+
+def plot_feature_histograms(all_features, out_dir: str, max_features: int = 12):
+    """Save a grid of per-feature density histograms with all classes overlaid.
+
+    The number of features shown is ``max_features`` capped by the column
+    count of the first class in ``all_features``. Panels are laid out four
+    per row; with up to 8 features every panel gets its own legend, otherwise
+    one shared legend is placed at the bottom of the figure. The figure is
+    written to ``02_feature_histograms.png`` inside ``out_dir``.
+    """
+    n_cols = 4
+    n_features = min(max_features, all_features[0][1].shape[1])
+    n_rows = -(-n_features // n_cols)  # ceiling division
+
+    fig, axes = plt.subplots(n_rows, n_cols, figsize=(4 * n_cols, 3 * n_rows))
+    axes = axes.flatten() if n_rows * n_cols > 1 else [axes]
+
+    colors = [TAB10[idx % 10] for idx in range(len(all_features))]
+    per_panel_legend = n_features <= 8
+
+    for j in range(n_features):
+        ax = axes[j]
+        for color, (lbl, df) in zip(colors, all_features):
+            # Classes with fewer columns simply do not appear in this panel.
+            if j >= df.shape[1]:
+                continue
+            ax.hist(df.iloc[:, j].values, bins=40, density=True, alpha=0.4,
+                    color=color, label=f"类 {lbl}")
+        ax.set_title(f"Feature {j+1}")
+        ax.set_xlabel("值")
+        ax.set_ylabel("密度")
+        if per_panel_legend:
+            ax.legend(fontsize=7, loc="upper right")
+
+    # Hide the unused panels of the last row.
+    for spare in axes[n_features:]:
+        spare.set_visible(False)
+
+    if not per_panel_legend:
+        handles = [plt.Rectangle((0, 0), 1, 1, color=color, alpha=0.4)
+                   for color in colors]
+        fig.legend(handles, [lbl for lbl, _ in all_features],
+                   loc="lower center", ncol=min(10, len(all_features)), fontsize=7)
+
+    fig.suptitle(f"特征分布直方图（各类叠加，前 {n_features} 维）", fontsize=13, y=1.01)
+    fig.tight_layout()
+    path = os.path.join(out_dir, "02_feature_histograms.png")
+    fig.savefig(path, dpi=150, bbox_inches="tight")
+    plt.close(fig)
+    print(f"[OK] {path}")
+
+
+def plot_feature_boxplots(all_features, out_dir: str, max_features: int = 20):
+    """Save a grid of per-feature box plots comparing the classes.
+
+    The number of features shown is ``max_features`` capped by the column
+    count of the first class in ``all_features``. In each panel a class is
+    drawn at x-position ``class index + 1`` and coloured by its class index,
+    so a class keeps the same colour in every panel even when another class
+    has no data for that feature. The figure is written to
+    ``03_feature_boxplots.png`` inside ``out_dir``.
+    """
+    n_features = min(max_features, all_features[0][1].shape[1])
+    n_cols = 4
+    n_rows = (n_features + n_cols - 1) // n_cols
+
+    fig, axes = plt.subplots(n_rows, n_cols, figsize=(4.5 * n_cols, 3.5 * n_rows))
+    axes = axes.flatten() if n_rows * n_cols > 1 else [axes]
+
+    for j in range(n_features):
+        ax = axes[j]
+        data_list = []
+        positions = []
+        labels_for_box = []
+        for idx, (lbl, df) in enumerate(all_features):
+            if j < df.shape[1]:
+                data_list.append(df.iloc[:, j].values)
+                positions.append(idx + 1)
+                labels_for_box.append(str(lbl))
+
+        bp = ax.boxplot(data_list, positions=positions, labels=labels_for_box,
+                         patch_artist=True, widths=0.6, showfliers=True,
+                         flierprops={"marker": ".", "markersize": 2, "alpha": 0.3})
+        # Colour by class index (position - 1), not by order within data_list:
+        # the two differ whenever a class was skipped for this feature.
+        for patch, position in zip(bp["boxes"], positions):
+            patch.set_facecolor(TAB10[(position - 1) % 10])
+            patch.set_alpha(0.6)
+        ax.set_title(f"Feature {j+1}")
+        ax.set_xlabel("类别")
+        ax.tick_params(axis="x", rotation=0, labelsize=8)
+
+    # Hide the unused panels of the last row.
+    for j in range(n_features, len(axes)):
+        axes[j].set_visible(False)
+
+    fig.suptitle(f"特征箱线图（各类别对比，前 {n_features} 维）", fontsize=13, y=1.01)
+    fig.tight_layout()
+    path = os.path.join(out_dir, "03_feature_boxplots.png")
+    fig.savefig(path, dpi=150, bbox_inches="tight")
+    plt.close(fig)
+    print(f"[OK] {path}")
+
+
+def _plot_confidence_ellipse(ax, mean, cov, color, alpha=0.2, n_std=1.0):
+    """Add a filled covariance ellipse to ``ax``.
+
+    The ellipse is centred on ``mean``, rotated to the direction of the
+    eigenvector with the largest eigenvalue of ``cov``, and each full axis
+    length is ``2 * n_std * sqrt(eigenvalue)``.
+
+    Args:
+        ax: Matplotlib Axes that receives the patch.
+        mean: Centre of the ellipse as an ``(x, y)`` pair.
+        cov: Symmetric 2x2 covariance matrix.
+        color: Colour used for both the face and the edge.
+        alpha: Opacity of the patch.
+        n_std: Number of standard deviations covered by each semi-axis.
+    """
+    from matplotlib.patches import Ellipse
+
+    vals, vecs = np.linalg.eigh(cov)
+    order = vals.argsort()[::-1]
+    # Round-off can give a (near-)singular covariance slightly negative
+    # eigenvalues; clamp them so the square root below never yields NaN.
+    vals = np.clip(vals[order], 0.0, None)
+    vecs = vecs[:, order]
+    angle = np.degrees(np.arctan2(vecs[1, 0], vecs[0, 0]))
+    width, height = 2 * n_std * np.sqrt(vals)
+    ellipse = Ellipse(xy=mean, width=width, height=height, angle=angle,
+                      facecolor=color, alpha=alpha, edgecolor=color, linewidth=0.8)
+    ax.add_patch(ellipse)
+
+
+def plot_pca(X, y, label_names, out_dir: str):
+    """Save a PCA scatter/centroid figure and an explained-variance chart.
+
+    ``04_pca.png`` holds two panels over the first two principal components:
+    a per-class scatter plot, and the class centroids with 1-sigma covariance
+    ellipses. ``04_pca_variance.png`` shows the individual and cumulative
+    explained-variance ratio of up to 30 components.
+
+    Args:
+        X: 2-D sample matrix, one row per sample.
+        y: Integer class index per sample; value ``i`` selects
+            ``label_names[i]``.
+        label_names: Display label of each class, in class-index order.
+        out_dir: Directory that receives the PNGs; it must already exist.
+    """
+    from sklearn.decomposition import PCA
+
+    # Fit enough components for the variance chart (up to 30); the scatter
+    # panels only use the first two. PCA cannot return more components than
+    # min(n_samples, n_features).
+    n_components = min(30, X.shape[0], X.shape[1])
+    if n_components < 2:
+        print("[WARN] PCA 至少需要 2 个样本和 2 个特征，跳过 PCA")
+        return
+
+    pca = PCA(n_components=n_components, random_state=42)
+    X_pca = pca.fit_transform(X)[:, :2]
+    ratios = pca.explained_variance_ratio_
+
+    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
+    colors = [TAB10[i % 10] for i in range(len(label_names))]
+
+    # Left panel: scatter plot of every sample.
+    ax = axes[0]
+    for i, lbl in enumerate(label_names):
+        mask = y == i
+        ax.scatter(X_pca[mask, 0], X_pca[mask, 1], c=[colors[i]], label=f"类 {lbl}",
+                   alpha=0.5, s=3, edgecolors="none")
+    ax.set_xlabel(f"PC1 ({ratios[0]*100:.1f}%)")
+    ax.set_ylabel(f"PC2 ({ratios[1]*100:.1f}%)")
+    ax.set_title("PCA 降维散点图")
+    ax.legend(fontsize=7, markerscale=3, loc="best")
+
+    # Right panel: class centroids with covariance ellipses.
+    ax2 = axes[1]
+    for i, lbl in enumerate(label_names):
+        class_points = X_pca[y == i]
+        if class_points.shape[0] == 0:
+            # An empty class has no centroid; its mean would be NaN.
+            continue
+        mean = class_points.mean(axis=0)
+        ax2.scatter(mean[0], mean[1], c=[colors[i]], s=80, marker="X",
+                    edgecolors="black", linewidths=0.8, zorder=5)
+        ax2.annotate(str(lbl), (mean[0], mean[1]), fontsize=8, ha="center", va="bottom",
+                     fontweight="bold", xytext=(0, 4), textcoords="offset points")
+        # The ellipse is only drawn for classes with more than two points.
+        if class_points.shape[0] > 2:
+            cov = np.cov(class_points.T)
+            _plot_confidence_ellipse(ax2, mean, cov, color=colors[i], alpha=0.25)
+
+    ax2.set_xlabel(f"PC1 ({ratios[0]*100:.1f}%)")
+    ax2.set_ylabel(f"PC2 ({ratios[1]*100:.1f}%)")
+    ax2.set_title("PCA 质心 + 1σ 置信椭圆")
+
+    fig.suptitle("PCA 降维分析", fontsize=14)
+    fig.tight_layout()
+    path = os.path.join(out_dir, "04_pca.png")
+    fig.savefig(path, dpi=150)
+    plt.close(fig)
+    print(f"[OK] {path}")
+
+    # Explained-variance ratio per component plus the running total.
+    fig2, ax = plt.subplots(figsize=(8, 4))
+    n = len(ratios)
+    component_ids = range(1, n + 1)
+    ax.bar(component_ids, ratios, alpha=0.6, color="steelblue", label="个体")
+    ax.plot(component_ids, np.cumsum(ratios), "ro-", markersize=4, label="累计")
+    ax.set_xlabel("主成分")
+    ax.set_ylabel("方差解释率")
+    ax.set_title("PCA 方差解释率")
+    ax.legend()
+    fig2.tight_layout()
+    path2 = os.path.join(out_dir, "04_pca_variance.png")
+    fig2.savefig(path2, dpi=150)
+    plt.close(fig2)
+    print(f"[OK] {path2}")
+
+
+def plot_tsne(X, y, label_names, out_dir: str, max_samples: int = 5000):
+    """Save a 2-D t-SNE scatter plot, subsampling large datasets per class.
+
+    When ``X`` has more than ``max_samples`` rows, at most
+    ``max_samples // len(label_names)`` rows (but at least one) are drawn
+    from each class with a fixed seed. The figure is written to
+    ``05_tsne.png`` inside ``out_dir``.
+
+    Args:
+        X: 2-D sample matrix, one row per sample.
+        y: Integer class index per sample; value ``i`` selects
+            ``label_names[i]``.
+        label_names: Display label of each class, in class-index order.
+        out_dir: Directory that receives the PNG; it must already exist.
+        max_samples: Row count above which the data is subsampled.
+    """
+    from sklearn.manifold import TSNE
+
+    if X.shape[0] > max_samples:
+        print(f"[INFO] t-SNE: 样本过多 ({X.shape[0]}), 分层抽样至 {max_samples}")
+        # One generator for the whole draw, so classes do not all replay the
+        # same random stream.
+        rng = np.random.RandomState(42)
+        per_class = max(1, max_samples // len(label_names))
+        indices = []
+        for i in range(len(label_names)):
+            idx_i = np.where(y == i)[0]
+            if len(idx_i) > per_class:
+                idx_i = rng.choice(idx_i, per_class, replace=False)
+            indices.extend(idx_i.tolist())
+        # An explicit integer dtype keeps an empty selection usable as an index.
+        indices = np.array(indices, dtype=int)
+        X_sub = X[indices]
+        y_sub = y[indices]
+    else:
+        X_sub = X
+        y_sub = y
+
+    n_samples = X_sub.shape[0]
+    if n_samples < 3:
+        print(f"[WARN] t-SNE: 样本过少 ({n_samples})，跳过")
+        return
+
+    print("[INFO] 正在计算 t-SNE（可能需要一些时间）...")
+    # scikit-learn requires perplexity < n_samples, hence the extra upper bound.
+    perplexity = min(50, max(5, n_samples // 3), n_samples - 1)
+    # The iteration count is left at the library default (1000): the keyword
+    # was renamed from `n_iter` to `max_iter` in scikit-learn 1.5, so passing
+    # either spelling breaks on some installed version.
+    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity, verbose=0)
+    X_tsne = tsne.fit_transform(X_sub)
+
+    colors = [TAB10[i % 10] for i in range(len(label_names))]
+
+    fig, ax = plt.subplots(figsize=(10, 8))
+    for i, lbl in enumerate(label_names):
+        mask = y_sub == i
+        ax.scatter(X_tsne[mask, 0], X_tsne[mask, 1], c=[colors[i]], label=f"类 {lbl}",
+                   alpha=0.5, s=3, edgecolors="none")
+    ax.set_xlabel("t-SNE 1")
+    ax.set_ylabel("t-SNE 2")
+    ax.set_title(f"t-SNE 降维散点图 (n={n_samples}, perplexity={perplexity})")
+    ax.legend(fontsize=7, markerscale=3, loc="best")
+    fig.tight_layout()
+    path = os.path.join(out_dir, "05_tsne.png")
+    fig.savefig(path, dpi=150)
+    plt.close(fig)
+    print(f"[OK] {path}")
+
+
+def plot_class_mean_std_heatmap(all_features, label_names, out_dir: str, max_features: int = 30):
+    """Save side-by-side heatmaps of per-class feature means and std devs.
+
+    Rows are classes and columns are the leading feature columns. A feature a
+    class does not have is stored as NaN, which seaborn leaves blank, so it
+    cannot be mistaken for a real value of 0. The figure is written to
+    ``06_class_mean_std_heatmap.png`` inside ``out_dir``.
+
+    Args:
+        all_features: Sequence of ``(label, frame)`` pairs, one per class and
+            in the same order as ``label_names``. Each ``frame`` is read
+            through ``.shape[1]`` and ``.iloc`` (presumably a pandas
+            DataFrame; confirm against the loader).
+        label_names: Row labels of the heatmaps.
+        out_dir: Directory that receives the PNG; it must already exist.
+        max_features: Upper bound on the number of leading columns shown.
+            The column count of the first class is the other bound.
+    """
+    n_features = min(max_features, all_features[0][1].shape[1]) if all_features else 0
+    n_classes = len(label_names)
+    if n_features <= 0 or n_classes == 0:
+        print("[WARN] 没有可绘制的特征，跳过类别统计热力图")
+        return
+
+    # NaN marks "class has no such column"; seaborn masks NaN cells.
+    mean_matrix = np.full((n_classes, n_features), np.nan)
+    std_matrix = np.full((n_classes, n_features), np.nan)
+
+    for i, (_, df) in enumerate(all_features):
+        n_present = min(n_features, df.shape[1])
+        if n_present == 0:
+            continue
+        values = df.iloc[:, :n_present].to_numpy(dtype=float)
+        mean_matrix[i, :n_present] = values.mean(axis=0)
+        # Population standard deviation (ddof=0), the np.std default.
+        std_matrix[i, :n_present] = values.std(axis=0)
+
+    show_ticks = n_features <= 30
+    show_values = n_features <= 20
+    tick_labels = [f"F{i+1}" for i in range(n_features)] if show_ticks else False
+    value_fmt = ".3f" if show_values else ""
+
+    fig, axes = plt.subplots(1, 2, figsize=(max(10, n_features * 0.35), max(5, n_classes * 0.5)))
+
+    sns.heatmap(mean_matrix, ax=axes[0], cmap="RdBu_r", center=0,
+                xticklabels=tick_labels, yticklabels=label_names,
+                annot=show_values, fmt=value_fmt,
+                linewidths=0.5, cbar_kws={"label": "均值", "shrink": 0.8})
+    axes[0].set_title("各类别特征均值")
+    axes[0].set_xlabel("特征维度")
+
+    sns.heatmap(std_matrix, ax=axes[1], cmap="YlOrRd",
+                xticklabels=tick_labels, yticklabels=label_names,
+                annot=show_values, fmt=value_fmt,
+                linewidths=0.5, cbar_kws={"label": "标准差", "shrink": 0.8})
+    axes[1].set_title("各类别特征标准差")
+    axes[1].set_xlabel("特征维度")
+
+    fig.suptitle("各类别特征统计对比", fontsize=13)
+    fig.tight_layout()
+    path = os.path.join(out_dir, "06_class_mean_std_heatmap.png")
+    fig.savefig(path, dpi=150, bbox_inches="tight")
+    plt.close(fig)
+    print(f"[OK] {path}")
+
+
+def plot_correlation_heatmap(X, out_dir: str, max_features: int = 30):
+    """Save a Pearson-correlation heatmap of the leading feature columns.
+
+    The figure is written to ``07_correlation_heatmap.png`` inside
+    ``out_dir``.
+
+    Args:
+        X: 2-D sample matrix, one row per sample and one column per feature.
+        out_dir: Directory that receives the PNG; it must already exist.
+        max_features: Upper bound on the number of leading columns compared.
+    """
+    n_features = min(max_features, X.shape[1])
+    if n_features <= 0:
+        print("[WARN] 没有可绘制的特征，跳过相关性热力图")
+        return
+    X_sub = X[:, :n_features]
+
+    # A constant column has zero variance, so its correlations are NaN
+    # (seaborn leaves those cells blank); silence the division warnings.
+    with np.errstate(divide="ignore", invalid="ignore"):
+        # atleast_2d: with a single feature corrcoef returns a 0-d value,
+        # which sns.heatmap cannot draw.
+        corr = np.atleast_2d(np.corrcoef(X_sub.T))
+
+    tick_labels = [f"F{i+1}" for i in range(n_features)] if n_features <= 30 else False
+
+    fig, ax = plt.subplots(figsize=(max(10, n_features * 0.5), max(8, n_features * 0.45)))
+    sns.heatmap(corr, ax=ax, cmap="RdBu_r", center=0, vmin=-1, vmax=1,
+                xticklabels=tick_labels, yticklabels=tick_labels,
+                linewidths=0.1, cbar_kws={"label": "Pearson r", "shrink": 0.8})
+    ax.set_title(f"特征相关性矩阵 (前 {n_features} 维)")
+    fig.tight_layout()
+    path = os.path.join(out_dir, "07_correlation_heatmap.png")
+    fig.savefig(path, dpi=150)
+    plt.close(fig)
+    print(f"[OK] {path}")
+
+
+def plot_global_distribution(X, out_dir: str):
+    """Save a two-panel overview of all feature values.
+
+    The left panel is a 100-bin histogram of every value in ``X`` with the
+    overall mean and median marked. The right panel shows mean +/- standard
+    deviation for each of the first 50 feature columns. The figure is written
+    to ``08_global_distribution.png`` inside ``out_dir``.
+
+    Args:
+        X: 2-D sample matrix, one row per sample and one column per feature.
+        out_dir: Directory that receives the PNG; it must already exist.
+    """
+    fig, (ax_hist, ax_err) = plt.subplots(1, 2, figsize=(12, 5))
+
+    # Left panel: pooled histogram with mean/median reference lines.
+    flat = X.flatten()
+    overall_mean = np.mean(flat)
+    overall_median = np.median(flat)
+    ax_hist.hist(flat, bins=100, color="steelblue", alpha=0.7, edgecolor="white",
+                 linewidth=0.3)
+    ax_hist.axvline(overall_mean, color="red", linestyle="--",
+                    label=f"均值={overall_mean:.4f}")
+    ax_hist.axvline(overall_median, color="orange", linestyle="--",
+                    label=f"中位数={overall_median:.4f}")
+    ax_hist.set_xlabel("特征值")
+    ax_hist.set_ylabel("频数")
+    ax_hist.set_title("全局特征值分布")
+    ax_hist.legend()
+
+    # Right panel: per-column mean with the standard deviation as error bar.
+    col_means = np.mean(X, axis=0)
+    col_stds = np.std(X, axis=0)
+    n_features = min(X.shape[1], 50)
+    dims = range(1, n_features + 1)
+    ax_err.errorbar(dims, col_means[:n_features], yerr=col_stds[:n_features],
+                    fmt="o", markersize=3, capsize=2, color="steelblue", alpha=0.7)
+    ax_err.set_xlabel("特征维度")
+    ax_err.set_ylabel("均值 ± 标准差")
+    ax_err.set_title(f"各维度均值与标准差 (前 {n_features} 维)")
+
+    fig.suptitle("全局特征概览", fontsize=13)
+    fig.tight_layout()
+    path = os.path.join(out_dir, "08_global_distribution.png")
+    fig.savefig(path, dpi=150)
+    plt.close(fig)
+    print(f"[OK] {path}")
+
+
+# ============================================================
+# 命令行入口
+# ============================================================
+
+def main():
+    """Parse the command line, load one dataset and write every chart.
+
+    Charts are saved under ``<root>/Visualizations/<folder>``. PCA and t-SNE
+    can be skipped with ``--no-pca`` / ``--no-tsne``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Deeplearning 数据可视化脚本",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+示例:
+    python Scripts/visualize.py -f 20260319Numbers -l 0 1 2 3 4 5 6 7 8 9
+    python Scripts/visualize.py -f "20260408 grap" -l 1 2 3 4 5 6 7 8 9
+    python Scripts/visualize.py -f 20260319Numbers -l 0 1 2 3 4 5 6 7 8 9 --no-tsne --max-features 15
+""",
+    )
+    parser.add_argument("-f", "--folder", required=True,
+                        help="Static/ 下的数据目录名")
+    parser.add_argument("-l", "--labels", nargs="+", required=True,
+                        help="类别标签列表，空格分隔")
+    parser.add_argument("-r", "--root", default=None,
+                        help="项目根目录（默认为 Deeplearning/）")
+    parser.add_argument("--max-features", type=int, default=20,
+                        help="可视化中显示的最大特征维度数 (默认 20)")
+    parser.add_argument("--no-tsne", action="store_true",
+                        help="跳过 t-SNE 计算")
+    parser.add_argument("--no-pca", action="store_true",
+                        help="跳过 PCA 计算")
+    parser.add_argument("--tsne-max-samples", type=int, default=5000,
+                        help="t-SNE 最大抽样数 (默认 5000)")
+
+    args = parser.parse_args()
+
+    # Reject sizes that would otherwise fail deep inside the plotting code.
+    if args.max_features < 1:
+        parser.error("--max-features 必须为正整数")
+    if args.tsne_max_samples < 1:
+        parser.error("--tsne-max-samples 必须为正整数")
+
+    # Default root: the parent of the directory that contains this script.
+    root = os.path.abspath(args.root or str(Path(__file__).resolve().parent.parent))
+
+    print(f"加载数据: {root}/Static/{args.folder}")
+    X, y, label_names, all_features, col_counts = load_all_data(root, args.folder, args.labels)
+    print(f"  样本数: {X.shape[0]}, 特征维度: {X.shape[1]}, 类别数: {len(label_names)}")
+    # enumerate() gives the class index directly; list.index() would rescan the
+    # list for every label and return the wrong index for a repeated label.
+    for i, lbl in enumerate(label_names):
+        cnt = int(np.sum(y == i))
+        print(f"    类 {lbl}: {cnt} 样本, {col_counts.get(lbl, X.shape[1])} 列")
+
+    out_dir = os.path.join(root, "Visualizations", args.folder)
+    _ensure_dir(out_dir)
+    print(f"\n输出目录: {out_dir}\n")
+
+    # Every title and label is Chinese, so a CJK-capable font must come first;
+    # DejaVu Sans has no CJK glyphs and stays only as the last-resort fallback.
+    plt.rcParams["font.family"] = "sans-serif"
+    plt.rcParams["font.sans-serif"] = [
+        "Noto Sans CJK SC", "Microsoft YaHei", "SimHei", "PingFang SC",
+        "WenQuanYi Micro Hei", "Arial Unicode MS", "DejaVu Sans",
+    ]
+    # Several CJK fonts lack U+2212, so draw minus signs as ASCII hyphens.
+    plt.rcParams["axes.unicode_minus"] = False
+
+    shown_features = min(args.max_features, X.shape[1])
+
+    print("生成可视化图表...\n")
+    plot_class_distribution(y, label_names, out_dir)
+    plot_feature_histograms(all_features, out_dir, max_features=shown_features)
+    plot_feature_boxplots(all_features, out_dir, max_features=shown_features)
+    if not args.no_pca:
+        plot_pca(X, y, label_names, out_dir)
+    if not args.no_tsne:
+        plot_tsne(X, y, label_names, out_dir, max_samples=args.tsne_max_samples)
+    plot_class_mean_std_heatmap(all_features, label_names, out_dir,
+                                 max_features=shown_features)
+    # The correlation matrix uses its own fixed cap of 30 columns.
+    plot_correlation_heatmap(X, out_dir, max_features=min(30, X.shape[1]))
+    plot_global_distribution(X, out_dir)
+
+    print(f"\n全部图表已保存到: {out_dir}")
+
+
+# Run the command-line entry point only when executed as a script.
+if __name__ == "__main__":
+    main()
--- a/main.py
+++ b/main.py
@ -69,11 +69,11 @@ def _save_yaml(file_path, data):

 def main():
  # 输入元数据文件夹名称
-  projet_name = '20260409 grap'
+  projet_name = '20260512 Graps'
  # 请在[]内输入每一个分类的名称
-  # label_names 是一个列表里面按顺序包含了小写的‘a'到‘z’
-  label_names = list(range(1, 10))
-  hidden_layers = [256, 128, 128, 128]
+
+  label_names = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']  # label_names是大写的A-I
+  hidden_layers = [256, 256]
  test_size = 0.5
  dropout_rate = 0
  epochs = 300