Deeplearning/remake/Qtorch/Functions/dataLoader.py
2024-10-07 09:54:32 +08:00

91 lines
2.7 KiB
Python

import os
import pandas as pd
# Base directory (relative to the current working directory) under which
# dLoader looks up the data folder it is asked to load.
STATIC_PATH = './Static'
def dLoader(folder, label_names=None):
    """
    Load every ``.xlsx`` file of a folder under ``STATIC_PATH`` into one table.

    Files are processed in sorted name order. Each file is converted by
    ``load_xlsx`` into rows of equal width (the largest sheet row count found
    by ``get_max_row_len``), and the per-file results are stacked.

    Args:
        folder (str): Name of the folder containing Excel files.
        label_names (list): Optional labels, one per file in sorted file-name
            order. Extra labels are ignored. If empty or not provided, each
            file name without its extension is used.

    Returns:
        pandas.DataFrame: Loaded and processed data with a fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no ``.xlsx`` files.
        IndexError: If fewer labels than files are supplied.
    """
    folder_path = os.path.join(STATIC_PATH, folder)

    # os.listdir returns entries in arbitrary order; sorting makes the pairing
    # of files with label_names (and the output row order) reproducible.
    file_names = sorted(f for f in os.listdir(folder_path) if f.endswith('.xlsx'))
    if not file_names:
        raise ValueError(f"No .xlsx files found in '{folder_path}'")

    if not label_names:
        # splitext only strips the extension, so dots inside the stem survive
        # ('run.1.xlsx' -> 'run.1').
        label_names = [os.path.splitext(f)[0] for f in file_names]
    elif len(label_names) < len(file_names):
        # Fail before any file is parsed instead of part-way through the loop.
        raise IndexError(
            f"{len(label_names)} label name(s) given for {len(file_names)} "
            f".xlsx file(s) in '{folder_path}'"
        )

    max_row_length = get_max_row_len(folder_path, file_names)
    all_features = [
        load_xlsx(os.path.join(folder_path, file_name), label_name, max_row_length)
        for file_name, label_name in zip(file_names, label_names)
    ]
    return pd.concat(all_features, ignore_index=True)
def load_xlsx(file_name, label_name, max_row_length, fill_rule='mean'):
    """
    Load one Excel file and turn its feature columns into fixed-width rows.

    Every second column of the sheet, starting with the second one
    (positions 1, 3, 5, ...), is kept. Sheet rows in which any kept column is
    missing are dropped, at most ``max_row_length`` of the remaining rows are
    used, and each kept column becomes one output row padded to
    ``max_row_length`` values by ``fill_to_len``.

    Args:
        file_name (str): Path to the Excel file.
        label_name (str): Label stored in the ``label`` column of every row.
        max_row_length (int): Number of feature values per output row; longer
            columns are truncated to it, shorter ones are padded.
        fill_rule (str): Rule for filling missing values ('min', 'mean', or None).

    Returns:
        pandas.DataFrame: One row per kept sheet column, indexed by that
        column's header, with columns ``feature1`` .. ``feature<max_row_length>``
        followed by ``label``.
    """
    df = pd.read_excel(file_name)

    # Chain the clean-up instead of mutating in place: the column slice is
    # derived from df, and in-place edits on it can raise
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    features = df.iloc[:, 1::2].dropna().reset_index(drop=True)

    # Cap the series length so the column rename below always lines up, even
    # when the caller passes a max_row_length smaller than this sheet.
    features = features.iloc[:max_row_length]

    # After transposing, each original column is a row that can be padded.
    features = features.T
    features = features.apply(
        lambda row: fill_to_len(row, max_row_length, fill_rule), axis=1
    )
    features['label'] = label_name
    features.columns = [f'feature{i+1}' for i in range(max_row_length)] + ['label']
    return features
def fill_to_len(row, length=1000, rule=None):
    """
    Pad a row with a constant so that it holds at least ``length`` values.

    Args:
        row (pandas.Series): Row to fill.
        length (int): Desired length of the row.
        rule (str): 'min' pads with the row minimum, 'mean' with the row mean;
            any other value (including None) pads with 0.

    Returns:
        pandas.Series: The values of ``row`` followed by the padding, with a
        fresh 0..n-1 index and no name. A row that already holds ``length``
        or more values is returned at its own length, without padding.
    """
    fill_value = 0
    if rule == 'min':
        fill_value = row.min()
    elif rule == 'mean':
        fill_value = row.mean()

    missing = length - len(row)
    if missing <= 0:
        # Nothing to append. Concatenating with an empty Series is avoided on
        # purpose: pandas has deprecated ignoring empty operands when it
        # resolves the result dtype, so the empty default-dtype Series could
        # turn a numeric row into object dtype.
        unpadded = row.reset_index(drop=True)
        unpadded.name = None  # the concat path also returns an unnamed Series
        return unpadded

    padding = pd.Series([fill_value] * missing)
    return pd.concat([row, padding], ignore_index=True)
def get_max_row_len(folder, filenames):
    """
    Find the largest sheet row count among a set of Excel files.

    Each file is read in full with ``pandas.read_excel`` and the number of
    rows of the resulting frame is recorded.

    Args:
        folder (str): Path to the folder containing Excel files.
        filenames (list): List of Excel file names inside ``folder``.

    Returns:
        int: Maximum row length.

    Raises:
        ValueError: If ``filenames`` is empty.
    """
    row_counts = []
    for name in filenames:
        sheet = pd.read_excel(os.path.join(folder, name))
        row_counts.append(sheet.shape[0])
    # max() raises ValueError when no file was given.
    return max(row_counts)