37 lines
1.2 KiB
Python
37 lines
1.2 KiB
Python
from sklearn.model_selection import train_test_split
|
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
|
|
|
def dsplit(data, labels=None, test_size=0.2, random_state=None):
    """
    Split a DataFrame into scaled train/test features and encoded targets.

    The last column of ``data`` is used as the target; every column before
    it is used as a feature. Features are standardized with a scaler that
    is fitted on the training split only and then applied to the test
    split. Targets are integer-encoded with a ``LabelEncoder``.

    Args:
        data (pandas.DataFrame): Input data, target in the last column.
        labels (list): Optional list of labels. When given, the encoder is
            fitted on this list instead of on the target column, and the
            encoded form of the list is returned as ``encoded_labels``.
            NOTE(review): presumably this list must cover every value that
            occurs in the target column -- confirm against callers.
        test_size (float): Proportion of the dataset to include in the
            test split.
        random_state (int): Random state for reproducibility.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test, encoded_labels)``.
        ``encoded_labels`` is ``None`` when ``labels`` was not supplied.
    """
    features = data.iloc[:, :-1]
    target = data.iloc[:, -1]

    label_encoder = LabelEncoder()
    if labels is None:
        # No explicit label list: learn the classes from the target column.
        label_encoder.fit(target)
        encoded_labels = None
    else:
        encoded_labels = label_encoder.fit_transform(labels)

    X_train, X_test, y_train_raw, y_test_raw = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
    )

    # Fit the scaler on the training features only, then reuse the same
    # fitted parameters for the test features.
    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(X_train)
    X_test = feature_scaler.transform(X_test)

    y_train = label_encoder.transform(y_train_raw.values.ravel())
    y_test = label_encoder.transform(y_test_raw.values.ravel())

    return X_train, X_test, y_train, y_test, encoded_labels