Refactor data saving and visualization functions; add metrics tracking in Qnn model

2025-06-24 23:10:53 +08:00 · 2025-06-24 23:10:53 +08:00 · 61e4af020d
parent 9b5c0eadf3
commit 61e4af020d
4 changed files with 220 additions and 30 deletions
--- a/Qfunctions/loaData.py
+++ b/Qfunctions/loaData.py
@ -62,14 +62,13 @@ def load_xlsx(fileName, labelName, max_row_length = 1000, fill_rule = None):
    # 提取偶数列
    features = df.iloc[0:, 1::2]
    # 复制 features DataFrame
-    features_copy = features.copy()
+    # features_copy = features.copy()
    # 使用 pd.concat 来追加副本到原始 DataFrame
-    features = pd.concat([features, features_copy], ignore_index=True, axis=1)
+    # features = pd.concat([features, features_copy], ignore_index=True, axis=1)

    # 计算变化率
    # first_value = features.iloc[0, :]  # 获取第一行的数据
    # features_pct_change = (features - first_value) / first_value
-
    # features = features_pct_change

    features.dropna(inplace=True)  
@ -77,11 +76,15 @@ def load_xlsx(fileName, labelName, max_row_length = 1000, fill_rule = None):

    features = features.T

-     # 补全每一行到指定长度
+    # 补全每一行到指定长度
    features = features.apply(lambda row: fill_to_len(row, max_row_length, fill_rule), axis=1)
    
+    # 获取实际的列数
+    actual_columns = features.shape[1]
+    
    features['label'] = labelName
-    features.columns = [f'feature{i+1}' for i in range(max_row_length)] + ['label']
+    # 使用实际的列数来创建列名
+    features.columns = [f'feature{i+1}' for i in range(actual_columns)] + ['label']
    
    return features
    
--- a/Qfunctions/saveToxlsx.py
+++ b/Qfunctions/saveToxlsx.py
@ -1,7 +1,173 @@
 import os
+import pandas as pd
+import matplotlib.pyplot as plt
+from mpl_toolkits.mplot3d import Axes3D
+import numpy as np

 def save_to_xlsx(project_name, file_name, data):
-    os.makedirs(f'Result/{project_name}', exist_ok=True)
-    data.to_excel(f'Result/{project_name}/{file_name}.xlsx', index=True)
-    print("Save successed to " + f'Result/{project_name}/{file_name}.xlsx')
+    folder_path = f'Result/{project_name}'
+    os.makedirs(folder_path, exist_ok=True)
+    data.to_excel(f'{folder_path}/{file_name}.xlsx', index=True)
+    print("Save successed to " + f'{folder_path}/{file_name}.xlsx')
+    save_to_pic(project_name=project_name, file_name=file_name)
    return
+
+def save_to_pic(project_name, file_name):
+    """Draw the picture(s) that belong to a workbook stored under ``Result/``.
+
+    ``file_name`` selects the plot type, and the workbook that gets read is
+    ``Result/<project_name>/<file_name>.xlsx``. Recognised names are
+    ``pca_2d``, ``pca_3d``, ``acc_and_loss``, ``cm`` and ``cmn``; any other
+    name only prints a notice and draws nothing.
+    """
+    result_dir = f'Result/{project_name}'
+    os.makedirs(result_dir, exist_ok=True)
+    workbook = f'{result_dir}/{file_name}.xlsx'
+    png_message = "Save successed to " + f'{result_dir}/{file_name}.png'
+
+    # (file name, drawing functions run in order, message printed afterwards)
+    plot_table = (
+        ('pca_2d', (draw_pca_2d,), png_message),
+        ('pca_3d', (draw_pca_3d,), png_message),
+        ('acc_and_loss', (draw_epoch_data, draw_last_epoch_bar_chart),
+         "Save successed to line graph and bar graph"),
+        ('cm', (draw_and_save_cm,), "Save successed cm"),
+        ('cmn', (draw_and_save_cm,), "Save successed cmn"),
+    )
+    for known_name, drawers, message in plot_table:
+        if file_name == known_name:
+            for draw in drawers:
+                draw(workbook)
+            print(message)
+            break
+    else:
+        print("unknow picture type")
+
+
+def draw_pca_2d(file_path):
+    """Scatter-plot the 2D PCA projection stored in an Excel workbook.
+
+    Reads the ``PC1``, ``PC2`` and ``labels`` columns from ``file_path``,
+    colours each point by its label and saves the figure next to the
+    workbook with the ``.xlsx`` suffix replaced by ``.png``.
+    """
+    table = pd.read_excel(file_path)
+    fig, ax = plt.subplots(figsize=(8, 6))
+    points = ax.scatter(
+        table['PC1'], table['PC2'],
+        c=table['labels'], cmap='viridis', edgecolor='k', alpha=0.6,
+    )
+    ax.set_xlabel('PC1')
+    ax.set_ylabel('PC2')
+    ax.set_title('2D PCA')
+    fig.colorbar(points, ax=ax, label='Labels')
+    fig.savefig(file_path.replace('.xlsx', '.png'))
+    plt.close(fig)
+
+def draw_pca_3d(file_path):
+    """Scatter-plot the 3D PCA projection stored in an Excel workbook.
+
+    Reads the ``PC1``, ``PC2``, ``PC3`` and ``labels`` columns from
+    ``file_path``, colours each point by its label and saves the figure next
+    to the workbook with the ``.xlsx`` suffix replaced by ``.png``.
+    """
+    df = pd.read_excel(file_path)
+    fig = plt.figure(figsize=(8, 6))
+    ax = fig.add_subplot(111, projection='3d')
+    scatter = ax.scatter(df['PC1'], df['PC2'], df['PC3'], c=df['labels'], cmap='viridis', edgecolor='k', alpha=0.6)
+    ax.set_xlabel('PC1')
+    ax.set_ylabel('PC2')
+    ax.set_zlabel('PC3')
+    ax.set_title('3D PCA')
+    fig.colorbar(scatter, ax=ax, label='Labels')
+    fig.savefig(file_path.replace('.xlsx', '.png'))
+    # Close the figure like the other drawing helpers do; pyplot otherwise
+    # keeps it alive as the current figure after this function returns.
+    plt.close(fig)
+
+def draw_epoch_data(file_path):
+    """Plot the per-epoch training metrics stored in an Excel workbook.
+
+    Reads the ``epoch``, ``train_loss``, ``train_accuracy``,
+    ``test_accuracy``, ``f1_score``, ``precision`` and ``recall`` columns and
+    draws them as line charts on a 2x3 grid (the sixth panel is switched
+    off). Both accuracies are multiplied by 100 before plotting. The figure
+    is saved next to the workbook with ``.xlsx`` replaced by ``_epoch.png``.
+    """
+    history = pd.read_excel(file_path)
+    x = history['epoch']
+    loss = history['train_loss']
+    acc_train = history['train_accuracy'] * 100
+    acc_test = history['test_accuracy'] * 100
+    f1 = history['f1_score']
+    prec = history['precision']
+    rec = history['recall']
+
+    fig, axs = plt.subplots(2, 3, figsize=(18, 12))
+
+    # One entry per panel: (axes, [(series, line format, legend label)], y label, title)
+    panels = [
+        (axs[0, 0], [(loss, 'b-', 'Train Loss')],
+         'Loss', 'Training Loss over Epochs'),
+        (axs[0, 1], [(acc_train, 'g-', 'Train Accuracy'), (acc_test, 'r-', 'Test Accuracy')],
+         'Accuracy (%)', 'Train and Test Accuracy over Epochs'),
+        (axs[0, 2], [(f1, 'm-', 'F1 Score')],
+         'F1 Score', 'F1 Score over Epochs'),
+        (axs[1, 0], [(prec, 'c-', 'Precision')],
+         'Precision', 'Precision over Epochs'),
+        (axs[1, 1], [(rec, 'y-', 'Recall')],
+         'Recall', 'Recall over Epochs'),
+    ]
+    for ax, curves, y_label, title in panels:
+        for series, line_format, legend_label in curves:
+            ax.plot(x, series, line_format, label=legend_label)
+        ax.set_xlabel('Epoch')
+        ax.set_ylabel(y_label)
+        ax.set_title(title)
+        ax.legend()
+
+    # The sixth panel has nothing to show, so hide its frame.
+    axs[1, 2].axis('off')
+
+    plt.tight_layout()
+    plt.savefig(file_path.replace('.xlsx', '_epoch.png'))
+    plt.close()
+
+def draw_last_epoch_bar_chart(file_path):
+    """Draw a bar chart of the metrics recorded in the last row of a workbook.
+
+    Reads ``train_loss``, ``train_accuracy``, ``test_accuracy``,
+    ``f1_score``, ``precision`` and ``recall`` from the final row of
+    ``file_path``. The two accuracies are multiplied by 100; the other
+    values are plotted as stored. Each bar is annotated with its value to two
+    decimals and the figure is saved next to the workbook with ``.xlsx``
+    replaced by ``_last_epoch_bar.png``.
+    """
+    final_row = pd.read_excel(file_path).iloc[-1]
+
+    # (workbook column, bar label, bar colour)
+    bar_specs = [
+        ('train_loss', 'Train Loss', 'blue'),
+        ('train_accuracy', 'Train Accuracy', 'green'),
+        ('test_accuracy', 'Test Accuracy', 'red'),
+        ('f1_score', 'F1 Score', 'magenta'),
+        ('precision', 'Precision', 'cyan'),
+        ('recall', 'Recall', 'yellow'),
+    ]
+    percent_columns = ('train_accuracy', 'test_accuracy')
+
+    bar_labels = [label for _, label, _ in bar_specs]
+    bar_colors = [color for _, _, color in bar_specs]
+    heights = [final_row[column] for column, _, _ in bar_specs]
+    # Only the accuracies are rescaled to percentages.
+    heights = [
+        height * 100 if column in percent_columns else height
+        for (column, _, _), height in zip(bar_specs, heights)
+    ]
+
+    fig, ax = plt.subplots(figsize=(10, 6))
+    ax.bar(bar_labels, heights, color=bar_colors)
+    ax.set_xlabel('Metrics')
+    ax.set_ylabel('Values')
+    ax.set_title('Last Epoch Metrics')
+    ax.set_ylim(bottom=0)
+
+    # Print each value just above its bar.
+    for position, height in enumerate(heights):
+        ax.text(position, height + 0.01, f'{height:.2f}', ha='center')
+
+    fig.tight_layout()
+    fig.savefig(file_path.replace('.xlsx', '_last_epoch_bar.png'))
+    plt.close(fig)
+
+def draw_and_save_cm(file_path):
+    """Draw the confusion matrix stored in an Excel workbook as a heat map.
+
+    The workbook is expected to hold one matrix whose first column contains
+    the row labels (the layout ``save_to_xlsx`` produces by writing with
+    ``index=True``) and whose header row contains the class labels. Works
+    for both raw counts and normalised matrices. The picture is saved next
+    to the workbook with the ``.xlsx`` suffix replaced by ``.png``.
+    """
+    # Read the first column as the index so the row labels cannot leak into
+    # the matrix values.
+    df_cm = pd.read_excel(file_path, index_col=0)
+
+    # The column headers are the class labels, used for both axes.
+    labels = df_cm.columns.tolist()
+
+    # Force a numeric array: slicing ``.values`` of a frame that also holds
+    # text labels yields an object array, which imshow cannot render.
+    cm = df_cm.to_numpy(dtype=float)
+
+    # A single panel; a second, never-drawn axes would only leave half of
+    # the saved picture empty.
+    fig, ax = plt.subplots(figsize=(6, 6))
+
+    ax.imshow(cm, interpolation='nearest', cmap='Blues')
+    ax.set_title('Confusion Matrix')
+    ax.set_xlabel('Predicted')
+    ax.set_ylabel('True')
+    ticks = np.arange(len(labels))
+    ax.set_xticks(ticks)
+    ax.set_yticks(ticks)
+    ax.set_xticklabels(labels)
+    ax.set_yticklabels(labels)
+
+    # Annotate every cell: whole numbers (counts) are printed without a
+    # decimal point, fractions (normalised matrix) with two decimals.
+    for i in range(len(labels)):
+        for j in range(len(labels)):
+            value = float(cm[i, j])
+            cell_text = f'{int(value)}' if value.is_integer() else f'{value:.2f}'
+            ax.text(j, i, cell_text, ha='center', va='center')
+
+    fig.tight_layout()
+    fig.savefig(file_path.replace('.xlsx', '.png'))
+    plt.close(fig)
--- a/Qtorch/Models/Qnn.py
+++ b/Qtorch/Models/Qnn.py
@ -2,7 +2,7 @@ import torch
 import torch.nn as nn
 import pandas as pd
 from sklearn.decomposition import PCA
-from sklearn.metrics import confusion_matrix
+from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
 from torch.utils.data import DataLoader, TensorDataset

 # from Qfunctions.divSet import divSet as ds
@ -27,20 +27,22 @@ class Qnn(nn.Module):
      'epoch': [],
      'train_loss': [],
      'train_accuracy': [],
-      'test_accuracy': []
+      'test_accuracy': [],
+      'precision': [],
+      'recall': [], 
+      'f1_score': []
    }
    
    self.pca_2d, self.pca_3d = None, None
-
    
  def __prepare_data(self):

    # 将data转换为tensor形式
-    X_train_tensor = torch.tensor(self.X_train, dtype=torch.float32).unsqueeze(1)
+    X_train_tensor = torch.tensor(self.X_train, dtype=torch.float32)
    self.y_train = self.LABEL_ENCODER.fit_transform(self.y_train)
    y_train_tensor = torch.tensor(self.y_train, dtype=torch.long)

-    X_test_tensor = torch.tensor(self.X_test, dtype=torch.float32).unsqueeze(1)
+    X_test_tensor = torch.tensor(self.X_test, dtype=torch.float32)
    self.y_test = self.LABEL_ENCODER.transform(self.y_test)
    y_test_tensor = torch.tensor(self.y_test, dtype=torch.long)

@ -57,7 +59,7 @@ class Qnn(nn.Module):
    model = self.to(self.DEVICE)

    criterion = nn.CrossEntropyLoss()
-    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)
+    optimizer = torch.optim.Adam(model.parameters(), lr=0.00001, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)
    best_test_accuracy = 0
    patience = 100
@ -106,12 +108,24 @@ class Qnn(nn.Module):
              all_prob.extend(prob.cpu().numpy())

      test_accuracy = correct_test / total_test
-      print(f'Epoch [{epoch+1}/{epochs_times}], Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy * 100:.2f}%, Test Accuracy: {test_accuracy*100:.2f}%')
+      f1 = f1_score(all_labels, all_predicted, average='macro')
+      precision = precision_score(all_labels, all_predicted, average='macro')
+      recall = recall_score(all_labels, all_predicted, average='macro')
+
+      if (epoch + 1) % 10 == 0:
+        print('===============================================')
+        print(f'Epoch [{epoch + 1} / {epochs_times}]:')
+        print(f'Train Accuracy: {train_accuracy * 100:.2f}%, Test Accuracy: {test_accuracy*100:.2f}%, Loss: {train_loss:.4f}')
+        print(f'Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score:{f1:.4f}, ')
+        print('===============================================')
      
      self.epoch_data['epoch'].append(epoch+1)
      self.epoch_data['train_loss'].append(train_loss)
      self.epoch_data['train_accuracy'].append(train_accuracy)
      self.epoch_data['test_accuracy'].append(test_accuracy)
+      self.epoch_data['precision'].append(precision)
+      self.epoch_data['recall'].append(recall)
+      self.epoch_data['f1_score'].append(f1)

      scheduler.step(train_loss)
    
@ -125,7 +139,9 @@ class Qnn(nn.Module):
          print(f"Early stopping at epoch {epoch+1}")
          break

-    self.cm = confusion_matrix(all_labels, all_predicted, normalize='true')
+    self.cm  = confusion_matrix(all_labels, all_predicted)
+    self.cmn = confusion_matrix(all_labels, all_predicted, normalize='true')
+
    print(self.cm)
    return

@ -147,11 +163,14 @@ class Qnn(nn.Module):
    principalComponents = pca_3d.fit_transform(self.X_train)
    df_pca3d = pd.DataFrame(data=principalComponents, columns=['PC1', 'PC2', 'PC3'])
    df_pca3d['labels'] = self.y_train
-    
+
    return df_pca2d, df_pca3d

  def get_cm(self):
+    """Return ``self.cm`` as a DataFrame with ``self.labels`` as row and column labels."""
    return pd.DataFrame(self.cm, columns=self.labels, index=self.labels)
+  
+  def get_cmn(self):
+    """Return the normalised confusion matrix as a labelled DataFrame.
+
+    Wraps ``self.cmn`` (not the raw counts in ``self.cm``) and uses
+    ``self.labels`` for both the row and the column labels.
+    """
+    return pd.DataFrame(self.cmn, columns=self.labels, index=self.labels)

  def get_epoch_data(self):
+    """Return the metrics collected in ``self.epoch_data`` as a DataFrame, one column per key."""
    return pd.DataFrame(self.epoch_data)
--- a/main.py
+++ b/main.py
@ -1,40 +1,42 @@
-from Qtorch.Models.Qcnn import QCNN
+from Qtorch.Models.Qmlp import Qmlp
 from Qfunctions.divSet import divSet
 from Qfunctions.loaData import load_data
 from Qfunctions.saveToxlsx import save_to_xlsx as save_to_xlsx

 def main():
  # 输入元数据文件夹名称
-  projet_name = '20241228 Write'                                                                
+  projet_name = '20250623 FHH-write'                                                                
  # 请在[]内输入每一个分类的名称
-  label_names = ['I', 'L', 'O', 'V', 'E', 'F', 'J', 'U', 'T']                                   
+  label_names = ['5', '2', '0', 'M', 'J', 'U']
  print(label_names)
  data = load_data(projet_name, label_names, isDir=False, fileClass='xlsx')
  X_train, X_test, y_train, y_test, encoder = divSet(
    data=data, labels=label_names, test_size= 0.3
  )
  
-  # model = Qmlp(
-  #   X_train=X_train, X_test=X_test, y_train=y_train, y_test= y_test,
-  #   hidden_layers = [128],
-  #   dropout_rate=0
-  #   )
+  model = Qmlp(
+    X_train=X_train, X_test=X_test, y_train=y_train, y_test= y_test,
+    hidden_layers = [16],
+    dropout_rate=0      
+    )

-  model = QCNN(
-     X_train=X_train, X_test=X_test, y_train=y_train, y_test= y_test,
-     dropout_rate=0
-  )
+  # model = QCNN(
+  #    X_train=X_train, X_test=X_test, y_train=y_train, y_test= y_test,
+  #    dropout_rate=0
+  # )
  
  pca_2d, pca_3d = model.get_PCA()

  model.fit(300)
  
  cm = model.get_cm()
+  cmn = model.get_cmn()
  epoch_data = model.get_epoch_data()

  save_to_xlsx(project_name=projet_name, file_name="pca_2d", data=pca_2d)
  save_to_xlsx(project_name=projet_name, file_name="pca_3d", data=pca_3d)
-  save_to_xlsx(project_name=projet_name, file_name="cm", data=cm )
+  save_to_xlsx(project_name=projet_name, file_name="cm", data=cm)
+  save_to_xlsx(project_name=projet_name, file_name="cmn", data=cmn)
  save_to_xlsx(project_name=projet_name, file_name="acc_and_loss", data=epoch_data)

  print("Done")