Deeplearning/remake/Qtorch/Functions/dataSplitter.py

37 lines
1.2 KiB
Python
Raw Normal View History

2024-10-07 09:54:32 +08:00
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
def dsplit(data, labels=None, test_size=0.2, random_state=None):
    """Split a table into scaled train/test features and encoded targets.

    Every column of ``data`` except the last is treated as a feature; the
    last column is the target. Features are standardised with a scaler
    fitted on the training split only, and targets are converted with a
    ``LabelEncoder``.

    Args:
        data (pandas.DataFrame): Input table; must support ``.iloc``
            positional indexing.
        labels (list): Optional class labels. When supplied, the encoder is
            fitted on this list instead of on the target column, so the
            target values are encoded with an encoder that has only seen
            ``labels``.
        test_size (float): Forwarded to ``train_test_split`` as the
            proportion of rows placed in the test split.
        random_state (int): Forwarded to ``train_test_split`` for
            reproducible shuffling.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, encoded_labels)``.
        ``encoded_labels`` is the encoded form of ``labels``, or ``None``
        when ``labels`` was not supplied.
    """
    features = data.iloc[:, :-1]
    target = data.iloc[:, -1]

    # Decide what the encoder learns its classes from before splitting.
    label_encoder = LabelEncoder()
    if labels is None:
        label_encoder.fit(target)
        encoded_labels = None
    else:
        encoded_labels = label_encoder.fit_transform(labels)

    train_features, test_features, train_target, test_target = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )

    # Fit the scaler on the training rows only, then reuse it for the test
    # rows so no test-set statistics leak into the scaling.
    feature_scaler = StandardScaler()
    feature_scaler.fit(train_features)
    scaled_train = feature_scaler.transform(train_features)
    scaled_test = feature_scaler.transform(test_features)

    encoded_train = label_encoder.transform(train_target.values.ravel())
    encoded_test = label_encoder.transform(test_target.values.ravel())

    return scaled_train, scaled_test, encoded_train, encoded_test, encoded_labels