46 lines
1.3 KiB
Python
46 lines
1.3 KiB
Python
from sklearn.model_selection import train_test_split
|
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
|
|
|
|
|
def divSet(data, labels=None, test_size=0.2, random_state=None):
    """Split data, scale features, and encode labels.

    This module is the canonical location for dataset splitting utilities.

    The last column of ``data`` is treated as the label column and all
    preceding columns as features. Features are standardized with a
    ``StandardScaler`` fitted on the training split only, and labels are
    converted to integer codes with a ``LabelEncoder``.

    Args:
        data: Table supporting positional ``.iloc`` indexing (e.g. a pandas
            DataFrame) whose last column holds the labels.
        labels: Optional collection of label values used to fit the encoder.
            When ``None`` or empty, the encoder is fitted on the label
            column of ``data`` instead.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, encoder)`` holding the
        scaled feature splits, the encoded label splits, and the fitted
        ``LabelEncoder``.

    Raises:
        ValueError: If the non-stratified fallback split also fails, or
            (per scikit-learn's ``LabelEncoder.transform``) if the label
            column contains a value the encoder was not fitted on.
    """
    encoder = LabelEncoder()

    # The last column is the label.
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Decide explicitly whether usable labels were supplied. A plain
    # `if labels:` raises ValueError for multi-element NumPy arrays and
    # pandas Series because their truth value is ambiguous.
    if labels is None:
        use_labels = False
    else:
        try:
            use_labels = len(labels) > 0
        except TypeError:
            # Unsized objects keep the original truthiness semantics.
            use_labels = bool(labels)

    encoder.fit(labels if use_labels else y)

    # Prefer stratified sampling so that every class appears in both the
    # training and the test split whenever possible.
    stratify_target = y if y.nunique() > 1 else None
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=random_state,
            stratify=stratify_target,
        )
    except ValueError:
        # Stratification can fail (e.g. too few samples per class); fall
        # back to an ordinary random split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

    # Standardize features; fit on the training split only so that no
    # statistics from the test split leak into the scaler.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Encode labels as integers.
    y_train = encoder.transform(y_train.values)
    y_test = encoder.transform(y_test.values)

    return X_train, X_test, y_train, y_test, encoder
|
|
|
|
|
|
# Public API: the only name exported by a star-import of this module.
__all__ = ["divSet"]
|