# Deeplearning/remake/Qtorch/Functions/dataLoader.py

import os
import pandas as pd

# Base directory that dLoader joins with the requested folder name. It is a
# relative path, so it resolves against the process's current working directory.
STATIC_PATH = './Static'

def dLoader(folder, label_names=None):

    """
    Load data from Excel files in a specified folder.

    Every ``.xlsx`` file found in ``STATIC_PATH/folder`` is loaded with
    ``load_xlsx``, using the row count of the longest file as the common
    feature length, and the per-file frames are stacked into one frame.

    Args:
    folder (str): Name of the folder (inside STATIC_PATH) containing Excel files.
    label_names (list): Optional list of label names, paired with the files in
        sorted file-name order. If not provided (or empty), each file's name
        without its extension is used.

    Returns:
    pandas.DataFrame: Loaded and processed data.

    Raises:
    ValueError: If the folder contains no usable .xlsx files, or if fewer
        labels than files were supplied.
    """

    folder_path = os.path.join(STATIC_PATH, folder)

    # os.listdir returns entries in arbitrary order; sorting keeps positional
    # label_names paired with the same files on every run. Names starting with
    # '~$' are the lock files Excel leaves next to an open workbook, not data.
    file_names = sorted(
        f for f in os.listdir(folder_path)
        if f.endswith('.xlsx') and not f.startswith('~$')
    )
    if not file_names:
        raise ValueError(f"No .xlsx files found in '{folder_path}'")

    if not label_names:
        # splitext removes only the extension, so 'run.1.xlsx' -> 'run.1'
        # (splitting on the first dot would collapse it to 'run').
        label_names = [os.path.splitext(f)[0] for f in file_names]
    elif len(label_names) < len(file_names):
        # Fail before any file is read instead of part-way through the loop.
        raise ValueError(
            f"Got {len(label_names)} label names for {len(file_names)} "
            f".xlsx files in '{folder_path}'"
        )

    max_row_length = get_max_row_len(folder_path, file_names)

    all_features = [
        load_xlsx(os.path.join(folder_path, file_name), label_name, max_row_length)
        for file_name, label_name in zip(file_names, label_names)
    ]

    return pd.concat(all_features, ignore_index=True)

def load_xlsx(file_name, label_name, max_row_length, fill_rule='mean'):
    """
    Load and process data from a single Excel file.

    Every second column of the sheet, starting with the second one, is taken
    as one sample. Rows with a missing value in any of those columns are
    dropped, the table is transposed so each sample becomes one row, and every
    row is padded to ``max_row_length`` values with ``fill_to_len``.

    Args:
    file_name (str): Path to the Excel file.
    label_name (str): Label for the data in this file.
    max_row_length (int): Number of feature values each sample is padded to.
    fill_rule (str): Rule for filling missing values ('min', 'mean', or None).

    Returns:
    pandas.DataFrame: One row per sample with columns feature1..featureN
    (N = max_row_length) followed by a 'label' column.

    Raises:
    ValueError: If the padded samples do not have exactly ``max_row_length``
        values (e.g. the file has more valid rows than ``max_row_length``, or
        no valid rows at all).
    """
    df = pd.read_excel(file_name)

    # Build a new frame rather than mutating the iloc slice in place: in-place
    # dropna/reset_index on a slice of df triggers SettingWithCopyWarning.
    samples = df.iloc[:, 1::2].dropna().reset_index(drop=True).T

    padded = samples.apply(
        lambda row: fill_to_len(row, max_row_length, fill_rule), axis=1
    )

    # Assigning the column names below would fail with an opaque pandas
    # "Length mismatch" ValueError; raise the same exception type with context.
    if padded.shape[1] != max_row_length:
        raise ValueError(
            f"'{file_name}': samples have {padded.shape[1]} values, "
            f"expected {max_row_length}"
        )

    padded.columns = [f'feature{i+1}' for i in range(max_row_length)]
    padded['label'] = label_name

    return padded

def fill_to_len(row, length=1000, rule=None):
    """
    Fill a row to a specified length.

    The padding value is the row's minimum for rule 'min', the row's mean for
    rule 'mean', and 0 for any other rule (including None). A row that is
    already at least ``length`` long is returned without padding and without
    truncation.

    Args:
    row (pandas.Series): Row to fill.
    length (int): Desired length of the row.
    rule (str): Rule for filling ('min', 'mean', or None).

    Returns:
    pandas.Series: Filled row, unnamed and indexed 0..n-1.
    """
    values = row.reset_index(drop=True)
    # Concatenating with the unnamed padding Series always gave an unnamed
    # result; keep that on every return path.
    values.name = None

    missing = length - len(values)
    if missing <= 0:
        # Nothing to pad. Returning early avoids concatenating an empty
        # object-dtype Series, which raises a FutureWarning on pandas 2.x and
        # can change the result dtype on newer pandas versions.
        return values

    if rule == 'min':
        fill_value = row.min()
    elif rule == 'mean':
        fill_value = row.mean()
    else:
        fill_value = 0

    padding = pd.Series([fill_value] * missing)
    if values.empty:
        # Same empty-operand concat problem, with the row as the empty side.
        return padding
    return pd.concat([values, padding], ignore_index=True)

def get_max_row_len(folder, filenames):
    """
    Get the maximum row length across all Excel files in a folder.

    Each file is read with pandas and its row count (``DataFrame.shape[0]``)
    is compared; missing values are not taken into account here.

    Args:
    folder (str): Path to the folder containing Excel files.
    filenames (list): List of Excel file names.

    Returns:
    int: Maximum row length, or 0 when ``filenames`` is empty.
    """
    row_counts = (
        pd.read_excel(os.path.join(folder, filename)).shape[0]
        for filename in filenames
    )
    # default=0 gives a well-defined answer for an empty file list instead of
    # the "max() arg is an empty sequence" ValueError.
    return max(row_counts, default=0)
init 2024-10-07 09:54:32 +08:00			`import os`
			`import pandas as pd`

			`STATIC_PATH = './Static'`

			`def dLoader(folder, label_names=None):`

			`"""`
			`Load data from Excel files in a specified folder.`

			`Args:`
			`folder (str): Name of the folder containing Excel files.`
			`label_names (list): Optional list of label names. If not provided, file names will be used.`

			`Returns:`
			`pandas.DataFrame: Loaded and processed data.`
			`"""`

			`folder_path = os.path.join(STATIC_PATH, folder)`
			`file_names = [f for f in os.listdir(folder_path) if f.endswith('.xlsx')]`


			`if not label_names:`
			`label_names = [f.split('.')[0] for f in file_names]`

			`max_row_length = get_max_row_len(folder_path, file_names)`

			`all_features = []`
			`for i, file_name in enumerate(file_names):`
			`features = load_xlsx(os.path.join(folder_path, file_name), label_names[i], max_row_length)`
			`all_features.append(features)`

			`return pd.concat(all_features, ignore_index=True)`

			`def load_xlsx(file_name, label_name, max_row_length, fill_rule='mean'):`
			`"""`
			`Load and process data from a single Excel file.`

			`Args:`
			`file_name (str): Path to the Excel file.`
			`label_name (str): Label for the data in this file.`
			`max_row_length (int): Maximum number of rows to consider.`
			`fill_rule (str): Rule for filling missing values ('min', 'mean', or None).`

			`Returns:`
			`pandas.DataFrame: Processed data from the Excel file.`
			`"""`
			`df = pd.read_excel(file_name)`
			`features = df.iloc[0:, 1::2]`
			`features.dropna(inplace=True)`
			`features.reset_index(drop=True, inplace=True)`
			`features = features.T`
			`features = features.apply(lambda row: fill_to_len(row, max_row_length, fill_rule), axis=1)`

			`features['label'] = label_name`
			`features.columns = [f'feature{i+1}' for i in range(max_row_length)] + ['label']`

			`return features`

			`def fill_to_len(row, length=1000, rule=None):`
			`"""`
			`Fill a row to a specified length.`

			`Args:`
			`row (pandas.Series): Row to fill.`
			`length (int): Desired length of the row.`
			`rule (str): Rule for filling ('min', 'mean', or None).`

			`Returns:`
			`pandas.Series: Filled row.`
			`"""`
			`fill_value = 0`
			`if rule == 'min':`
			`fill_value = row.min()`
			`elif rule == 'mean':`
			`fill_value = row.mean()`
			`fill_values = pd.Series([fill_value] * (length - len(row)))`
			`return pd.concat([row, fill_values], ignore_index=True)`

			`def get_max_row_len(folder, filenames):`
			`"""`
			`Get the maximum row length across all Excel files in a folder.`

			`Args:`
			`folder (str): Path to the folder containing Excel files.`
			`filenames (list): List of Excel file names.`

			`Returns:`
			`int: Maximum row length.`
			`"""`
			`return max(pd.read_excel(os.path.join(folder, filename)).shape[0] for filename in filenames)`