91 lines
2.7 KiB
Python
91 lines
2.7 KiB
Python
|
import os
|
||
|
import pandas as pd
|
||
|
|
||
|
# Base directory; dLoader joins it with the requested folder name to locate the Excel files.
STATIC_PATH = './Static'
|
||
|
|
||
|
def dLoader(folder, label_names=None):
    """
    Load data from Excel files in a specified folder.

    Every ``.xlsx`` file in ``STATIC_PATH/folder`` is loaded with
    ``load_xlsx``, padded to a common number of feature columns and the
    per-file frames are stacked into a single frame.

    Files are processed in sorted file-name order. ``os.listdir`` returns
    entries in arbitrary order, so without sorting the pairing between
    files and ``label_names`` would not be reproducible.

    Args:
        folder (str): Name of the folder (relative to ``STATIC_PATH``)
            containing Excel files.
        label_names (list): Optional list of label names, matched to the
            files in sorted file-name order. If not provided (or empty),
            the part of each file name before the first dot is used.

    Returns:
        pandas.DataFrame: Loaded and processed data, one row per feature
        series, with a fresh 0..n-1 index.

    Raises:
        ValueError: If the folder contains no usable ``.xlsx`` files, or if
            fewer labels than files are supplied.
    """
    folder_path = os.path.join(STATIC_PATH, folder)

    # Skip "~$name.xlsx" entries: Excel creates these lock files while a
    # workbook is open and they are not readable workbooks.
    file_names = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith('.xlsx') and not f.startswith('~$')
    )
    if not file_names:
        raise ValueError(f"No .xlsx files found in '{folder_path}'")

    if not label_names:
        label_names = [f.split('.')[0] for f in file_names]
    elif len(label_names) < len(file_names):
        # Fail early with a clear message instead of an IndexError mid-loop.
        raise ValueError(
            f"Got {len(label_names)} label names for {len(file_names)} files "
            f"in '{folder_path}'"
        )

    max_row_length = get_max_row_len(folder_path, file_names)

    # zip stops at the shorter sequence, so surplus labels are ignored.
    all_features = [
        load_xlsx(os.path.join(folder_path, file_name), label, max_row_length)
        for file_name, label in zip(file_names, label_names)
    ]

    return pd.concat(all_features, ignore_index=True)
|
||
|
|
||
|
def load_xlsx(file_name, label_name, max_row_length, fill_rule='mean'):
    """
    Load and process data from a single Excel file.

    Every second column of the sheet, starting with the second one, is
    treated as a feature series. Sheet rows with a missing value in any of
    those columns are dropped. Each series then becomes one output row,
    padded to ``max_row_length`` values by ``fill_to_len``.

    Args:
        file_name (str): Path to the Excel file.
        label_name (str): Label for the data in this file.
        max_row_length (int): Number of feature columns each output row is
            padded to.
        fill_rule (str): Rule for filling missing values ('min', 'mean', or
            None); passed through to ``fill_to_len``.

    Returns:
        pandas.DataFrame: Processed data from the Excel file, with columns
        ``feature1`` .. ``feature<max_row_length>`` followed by ``label``.
    """
    df = pd.read_excel(file_name)

    # Non-mutating chain: dropna()/reset_index() return new objects, so the
    # slice taken from ``df`` is never modified in place.
    selected = df.iloc[:, 1::2].dropna().reset_index(drop=True)

    # Transpose so each selected sheet column becomes one row, then pad.
    features = selected.T.apply(
        lambda row: fill_to_len(row, max_row_length, fill_rule), axis=1
    )

    features.columns = [f'feature{i + 1}' for i in range(max_row_length)]
    features['label'] = label_name

    return features
|
||
|
|
||
|
def fill_to_len(row, length=1000, rule=None):
    """
    Fill a row to a specified length.

    Values are appended to the end of ``row`` until it holds ``length``
    entries. A row that already has ``length`` or more entries is returned
    with its values unchanged (it is never truncated). The result always
    has a fresh 0..n-1 index and no name.

    Args:
        row (pandas.Series): Row to fill.
        length (int): Desired length of the row.
        rule (str): Rule for filling: 'min' pads with the row minimum,
            'mean' pads with the row mean, anything else (including None)
            pads with 0.

    Returns:
        pandas.Series: Filled row.
    """
    if rule == 'min':
        fill_value = row.min()
    elif rule == 'mean':
        fill_value = row.mean()
    else:
        fill_value = 0

    missing = length - len(row)
    if missing <= 0:
        # Nothing to append. Avoid concatenating with an empty Series, which
        # pandas deprecates and which can disturb the resulting dtype.
        result = row.reset_index(drop=True)
        result.name = None
        return result

    padding = pd.Series([fill_value] * missing)
    if row.empty:
        # Same reason as above: do not concatenate an empty Series.
        return padding

    return pd.concat([row, padding], ignore_index=True)
|
||
|
|
||
|
def get_max_row_len(folder, filenames):
    """
    Get the maximum row length across all Excel files in a folder.

    Each file is read with ``pandas.read_excel`` and its number of rows
    (``DataFrame.shape[0]``) is compared.

    Args:
        folder (str): Path to the folder containing Excel files.
        filenames (list): List of Excel file names.

    Returns:
        int: Maximum row length, or 0 when ``filenames`` is empty.
    """
    row_counts = (
        pd.read_excel(os.path.join(folder, filename)).shape[0]
        for filename in filenames
    )
    # default=0 avoids the opaque "max() arg is an empty sequence" error.
    return max(row_counts, default=0)
|