特征构建-多项式展开的特征生成(sklearn实现和pyspark实现)
在现有数据中挑选或将现有数据进行变换,组合形成新特征,此过程称为特征构建。
机器学习项目中,通常会给一定的特征数据进行分类或者回归预测。有时需要构建更多的特征,然后对特征再进行特征选择。通过增加一些输入数据的非线性特征来增加模型的复杂度通常是有效的。
一个简单通用的办法是使用多项式特征,这可以获得特征的更高次数项和交互项。
1. sklearn实现
# coding=utf-8
# Polynomial-expansion feature generation (sklearn implementation).
# class sklearn.preprocessing.PolynomialFeatures(degree=2, *, interaction_only=False, include_bias=True, order='C')
import numpy as np
from sklearn.preprocessing import PolynomialFeatures

# Toy input: 3 samples, 2 features -> [[0, 1], [2, 3], [4, 5]].
X = np.arange(6).reshape(3, 2)
print('X: ')
print(X)

# Default parameters:
#   interaction_only - keep only interaction terms (no pure powers)?
#   include_bias     - prepend a constant intercept column of ones?
poly_1 = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
poly_model = poly_1.fit(X)
X_1 = poly_model.transform(X)
print('X_1: ')
print(X_1)

# Same expansion but without the bias column; fit and transform in one call.
poly_2 = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_2 = poly_2.fit_transform(X)
print('X_2: ')
print(X_2)

# Interaction terms only (x0*x1), no squared terms, no bias column.
poly_3 = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_3 = poly_3.fit_transform(X)
print('X_3: ')
print(X_3)
2. pyspark实现
# coding=utf-8
# Polynomial-expansion feature generation (pyspark implementation).
# class pyspark.ml.feature.PolynomialExpansion(*, degree=2, inputCol=None, outputCol=None)
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import PolynomialExpansion

if __name__ == '__main__':
    spark = SparkSession.builder.appName('polynomial').getOrCreate()
    try:
        # 6 rows with four numeric columns a, b, c, d.
        data = spark.createDataFrame(
            ([1, 2, 1, 2], [2, 4, 3, 4], [3, 6, 5, 6],
             [4, 8, 7, 8], [5, 10, 9, 10], [6, 12, 11, 12]),
            ['a', 'b', 'c', 'd'])
        data.show()

        # PolynomialExpansion expects a single vector column, so first
        # assemble the four numeric columns into one 'features' vector.
        columns_t = ['a', 'b', 'c', 'd']
        df_assembler = VectorAssembler(inputCols=columns_t, outputCol='features')
        data = df_assembler.transform(data)
        data.show()

        # Degree-2 polynomial expansion of the assembled feature vector.
        pe_model = PolynomialExpansion(degree=2, inputCol='features',
                                       outputCol='polynomial_features')
        data = pe_model.transform(data)
        data.show()
    finally:
        # Fix: the original never stopped the session, leaking the Spark
        # driver/JVM resources until process exit.
        spark.stop()