"""Dataset splitting utilities.

This module is the canonical location for dataset splitting utilities.
"""

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder


def divSet(data, labels=None, test_size=0.2, random_state=None):
    """Split data, scale features, and encode labels.

    The last column of ``data`` is treated as the label column; all other
    columns are treated as features.

    Args:
        data: A pandas ``DataFrame`` (accessed via ``.iloc``) whose last
            column holds the class label.
        labels: Optional collection of all known class labels (list, tuple,
            NumPy array, pandas Series, ...). When given and non-empty, the
            label encoder is fitted on it instead of on the label column, so
            the encoding can cover classes that are absent from ``data``.
            ``None`` or an empty collection means "fit on the label column".
        test_size: Passed through to ``train_test_split``.
        random_state: Passed through to ``train_test_split``.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test, encoder)`` where the
        feature sets have been standardized with a ``StandardScaler`` fitted
        on the training split only, the label sets have been integer-encoded,
        and ``encoder`` is the fitted ``LabelEncoder``.

    Raises:
        ValueError: If the split itself is invalid (propagated from
            ``train_test_split``), or if the label column contains a value
            that is not present in ``labels``.
    """
    # The last column is the label; everything before it is a feature.
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    encoder = LabelEncoder()
    # Explicit None/length check: plain truthiness (`if labels:`) raises for
    # NumPy arrays and pandas Series with more than one element.
    if labels is not None and len(labels) > 0:
        encoder.fit(labels)
    else:
        encoder.fit(y)

    # Prefer stratified sampling so that every class shows up in both the
    # training and the test split whenever possible.
    stratify_target = y if y.nunique() > 1 else None
    split_kwargs = {"test_size": test_size, "random_state": random_state}
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, stratify=stratify_target, **split_kwargs
        )
    except ValueError:
        if stratify_target is None:
            # No stratification was requested, so retrying the identical
            # call cannot succeed; surface the original error.
            raise
        # Stratification can fail (e.g. too few samples in a class); fall
        # back to a plain random split.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, **split_kwargs
        )

    # Standardize features. The scaler is fitted on the training split only
    # so that no test-set statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Encode labels as integers.
    y_train = encoder.transform(y_train.values)
    y_test = encoder.transform(y_test.values)

    return X_train, X_test, y_train, y_test, encoder


__all__ = ["divSet"]