This article analyzes stock market data from two angles: the distribution of each feature and the importance of each feature. We first examine the distributions of individual features and the relationships between them by plotting charts, then use machine learning models to rank the features by importance and pick out the ones that contribute most to the target. Choosing the right features has a substantial effect on how well any model built later performs.
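The walkthrough assumes a DataFrame df of daily OHLCV bars with an 'Adj Close' column, but the loading step is not shown. Below is a minimal sketch of one way to obtain such a frame, assuming the yfinance package; the ticker and date range are placeholders, not from the original article.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf  # assumed data source; the original loader is not shown

# Placeholder ticker and date range, for illustration only
df = yf.download("AAPL", start="2015-01-01", end="2020-01-01",
                 auto_adjust=False)  # keep the raw 'Adj Close' column
print(df.head())  # columns: Open, High, Low, Close, Adj Close, Volume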
# Derived features: intraday range, open-to-close change, lagged moving averages
df['H-L'] = df['High'] - df['Low']
df['O-C'] = df['Adj Close'] - df['Open']
# shift(1) uses only past prices, so the moving averages contain no lookahead
df['3day MA'] = df['Adj Close'].shift(1).rolling(window=3).mean()
df['10day MA'] = df['Adj Close'].shift(1).rolling(window=10).mean()
df['30day MA'] = df['Adj Close'].shift(1).rolling(window=30).mean()
df['Std_dev'] = df['Adj Close'].rolling(5).std()
df.dtypes
df.describe().T
df.isnull().sum()
Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
H-L          0
O-C          0
3day MA      3
10day MA    10
30day MA    30
Std_dev      4
dtype: int64
df_missing_count = df.isnull().sum()
# The NaN counts match the rolling windows above: 3/10/30 rows for the
# moving averages and 4 for the 5-day standard deviation
# Another, less common way to set the figure size
plt.rcParams['figure.figsize'] = (15, 8)
df_missing_count.plot.bar()
plt.show()
# Here -1 is treated as a sentinel value for missing data
print("column nunique NaN")
for column in df:
    print("{0:15} {1:6d} {2:6}".format(
        column, df[column].nunique(), (df[column] == -1).sum()))
column nunique NaN
Open              1082      0
High              1083      0
Low               1025      0
Close             1098      0
Adj Close         1173      0
Volume            1250      0
H-L                357      0
O-C               1237      2
3day MA           1240      0
10day MA          1244      0
30day MA          1230      0
Std_dev           1252      0
import seaborn as sns
# One way to set a custom color palette:
# cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(df.corr(), annot=True, cmap='Blues')
columns_multi = [x for x in list(df.columns)]
df.hist(layout=(3, 4), column=columns_multi)
# A less common way to resize the current figure
fig = plt.gcf()
fig.set_size_inches(20, 9)
names = columns_multi
df.plot(kind='density', subplots=True, layout=(3, 4), sharex=False)
# Note: the size argument was renamed to height in seaborn 0.9+
sns.pairplot(df, size=3, diag_kind="kde")
from sklearn.feature_selection import RFE, RFECV, f_regression
from sklearn.linear_model import (LinearRegression, Ridge, Lasso, LarsCV)
from stability_selection import StabilitySelection, RandomizedLasso
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
df = df.dropna()
Y = df['Adj Close'].values
X = df.values
colnames = df.columns
# Dictionary to store the ranking produced by each method
ranks = {}

# Helper that rescales a method's feature scores to [0, 1], rounded to 2 decimals
def ranking(ranks, names, order=1):
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(
        order * np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x, 2), ranks)
    res = dict(zip(names, ranks))
    return res
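To see what the helper does, here is a quick check with made-up scores (the values and names below are illustrative only): the largest score maps to 1.0, the smallest to 0.0, and order=-1 flips the scale, which is used later for RFE, where a lower rank number means a more important feature.

# Illustrative sanity check with fake scores
print(ranking(np.array([3.0, 1.0, 2.0]), ['a', 'b', 'c']))
# -> {'a': 1.0, 'b': 0.0, 'c': 0.5}
print(ranking(np.array([1.0, 2.0, 3.0]), ['a', 'b', 'c'], order=-1))
# -> {'a': 1.0, 'b': 0.5, 'c': 0.0}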
# Linear regression
lr = LinearRegression(normalize=True)  # normalize was removed in scikit-learn 1.2; scale manually there
lr.fit(X, Y)
ranks["LinReg"] = ranking(np.abs(lr.coef_), colnames)

# Ridge regression
ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks['Ridge'] = ranking(np.abs(ridge.coef_), colnames)

# Lasso
lasso = Lasso(alpha=.05)
lasso.fit(X, Y)
ranks["Lasso"] = ranking(np.abs(lasso.coef_), colnames)
After fitting, tree-based models expose a feature_importances_ attribute. Here a random forest regressor predicts the adjusted close from the other columns of dataset (a DataFrame with extra trading-signal columns prepared earlier in the original article), and the importances are plotted in descending order.

X_1 = dataset[['Open', 'High', 'Low', 'Volume', 'Increase_Decrease',
               'Buy_Sell_on_Open', 'Buy_Sell', 'Returns']]
y_1 = dataset['Adj Close']
# Create a random forest regressor
clf = RandomForestRegressor(random_state=0, n_jobs=-1)
# Fit the model
model = clf.fit(X_1, y_1)
# Compute the feature importances
importances = model.feature_importances_
# Sort the importances in descending order
indices = np.argsort(importances)[::-1]
# Rearrange the feature names so they match the sorted importances
names = [X_1.columns[i] for i in indices]
# Create the figure
plt.figure(figsize=(10, 6))
# Add a title
plt.title("Feature Importance")
# Bar chart of the sorted importances
plt.bar(range(X_1.shape[1]), importances[indices])
# Feature names on the x axis
plt.xticks(range(X_1.shape[1]), names, rotation=90)
plt.show()
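Alongside the chart, the same ranking can be dumped as text; a small convenience sketch, not from the original article:

# Print the sorted importances next to their feature names
for name, score in zip(names, importances[indices]):
    print("%-20s %.4f" % (name, score))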
The same attribute works for classification. Here a random forest classifier predicts the Increase_Decrease label, and its feature_importances_ are plotted the same way.

X2 = dataset[['Open', 'High', 'Low', 'Adj Close', 'Volume',
              'Buy_Sell_on_Open', 'Buy_Sell', 'Returns']]
y2 = dataset['Increase_Decrease']
clf = RandomForestClassifier(random_state=0, n_jobs=-1)
model = clf.fit(X2, y2)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
names = [X2.columns[i] for i in indices]
plt.figure(figsize=(10, 6))
plt.title("Feature Importance")
plt.bar(range(X2.shape[1]), importances[indices])
plt.xticks(range(X2.shape[1]), names, rotation=90)
plt.show()
rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=3)
rf.fit(X, Y)
ranks["RF"] = ranking(rf.feature_importances_, colnames)

The next two are "top-level" feature selection algorithms, so called because they are built on top of a model-based selection method such as a regression or an SVM: they fit models on many different subsets of the data and then aggregate the results into a final score for each feature.
# Stability selection with a randomized Lasso as the base estimator
lambda_grid = np.linspace(0.001, 0.5, num=100)
rlasso = RandomizedLasso(alpha=0.04)
selector = StabilitySelection(base_estimator=rlasso, lambda_name='alpha',
                              lambda_grid=lambda_grid, threshold=0.9, verbose=1)
selector.fit(X, Y)
ranks["rlasso/Stability"] = ranking(np.abs(selector.stability_scores_.max(axis=1)),
                                    colnames)
print('finished')
{'Open': 1.0, 'High': 1.0, 'Low': 0.76, 'Close': 1.0, 'Adj Close': 0.99,
 'Volume': 0.0, 'H-L': 0.0, 'O-C': 1.0, '3day MA': 1.0, '10day MA': 0.27,
 '30day MA': 0.75, 'Std_dev': 0.0}
finished
from stability_selection import plot_stability_path  # import not shown in the original

fig, ax = plot_stability_path(selector)
fig.set_size_inches(15, 6)
fig.show()
# Mask (or integer indices) of the selected features
selected_variables = selector.get_support(indices=True)
selected_scores = selector.stability_scores_.max(axis=1)
print('Selected variables are:')
print('-----------------------')
for idx, (variable, score) in enumerate(
        zip(selected_variables, selected_scores[selected_variables])):
    print('Variable %d: [%d], score %.3f' % (idx + 1, variable, score))
Selected variables are:
-----------------------
Variable 1: [0], score 1.000
Variable 2: [1], score 1.000
Variable 3: [3], score 1.000
Variable 4: [4], score 0.990
Variable 5: [7], score 1.000
Variable 6: [8], score 1.000
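The bracketed numbers are column positions in X; mapping them back to the colnames defined earlier (a small added convenience) makes the selection readable:

print([colnames[i] for i in selected_variables])
# -> ['Open', 'High', 'Close', 'Adj Close', 'O-C', '3day MA']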
sklearn.feature_selection.RFE(estimator, *, n_features_to_select=None, step=1, verbose=0, importance_getter='auto')
estimator : Estimator instance
    A supervised learning estimator with a fit method that provides information about feature importance (e.g. coef_ or feature_importances_).
n_features_to_select : int or float, default=None
    The number of features to select. If None, half of the features are selected. If an integer, it is the absolute number of features to select. If a float between 0 and 1, it is the fraction of features to select.
step : int or float, default=1
    If greater than or equal to 1, step is the (integer) number of features to remove at each iteration. If within (0.0, 1.0), step is the percentage (rounded down) of features to remove at each iteration.
verbose : int, default=0
    Controls the verbosity of the output.
importance_getter : str or callable, default='auto'
    If 'auto', uses the feature importance from the estimator's coef_ or feature_importances_ attribute.
lr = LinearRegression(normalize=True)
lr.fit(X, Y)
# Stop the search only when a single feature is left
rfe = RFE(lr, n_features_to_select=1, verbose=3)
rfe.fit(X, Y)
ranks["RFE"] = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)
Fitting estimator with 12 features. ... Fitting estimator with 2 features.
scikit-learn's RFE class performs recursive feature elimination; it also provides RFECV, which uses cross-validation to rank the features and choose how many to keep.

# Instantiate the estimator and the feature selector
svr_mod = SVR(kernel="linear")
rfecv = RFECV(svr_mod, cv=5)
# Fit the model
rfecv.fit(X, Y)
ranks["RFECV"] = ranking(list(map(float, rfecv.ranking_)), colnames, order=-1)
# Print the support mask and the ranking
print(rfecv.support_)
print(rfecv.ranking_)
print(colnames)  # X is a NumPy array here, so it has no .columns attribute
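A common follow-up is to plot the cross-validated score against the number of selected features. A short sketch; note that older scikit-learn releases expose the scores as rfecv.grid_scores_, while versions 1.0 and later provide rfecv.cv_results_['mean_test_score']:

# Pick whichever score attribute this scikit-learn version provides
scores = getattr(rfecv, "grid_scores_", None)
if scores is None:
    scores = rfecv.cv_results_["mean_test_score"]
plt.figure(figsize=(10, 4))
plt.plot(range(1, len(scores) + 1), scores, marker="o")
plt.xlabel("Number of features selected")
plt.ylabel("Cross-validated score")
plt.show()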
# Drop the feature found unimportant in the previous step (kept commented out here)
# X = X.drop('sex', axis=1)
# Instantiate
larscv = LarsCV(cv=5, normalize=False)
# Fit the model
larscv.fit(X, Y)
# LarsCV exposes no ranking_ attribute; rank by the absolute coefficients instead
ranks["LarsCV"] = ranking(np.abs(larscv.coef_), colnames)
# Print the R^2 score and the estimated alpha
print(larscv.score(X, Y))
print(larscv.alpha_)

These two cross-validated selectors are worth using when the quality of the feature ranking matters. They take a while to run, so the results are left for readers to reproduce on their own.
# Average each feature's score across all methods
r = {}
for name in colnames:
    r[name] = round(np.mean([ranks[method][name]
                             for method in ranks.keys()]), 2)

methods = sorted(ranks.keys())
ranks["Mean"] = r
methods.append("Mean")

print(" %s" % " ".join(methods))
for name in colnames:
    print("%s %s" % (name, " ".join(map(str,
        [ranks[method][name] for method in methods]))))
             Lasso  LinReg  RF    RFE   Ridge  rlasso/Stability  Mean
Open         1.0    1.0     0.02  0.91  0.47   1.0               0.73
High         0.14   0.0     0.1   0.36  0.06   1.0               0.28
Low          0.02   0.0     0.08  0.73  0.05   0.76              0.27
Close        0.14   0.0     0.64  0.55  0.32   1.0               0.44
Adj Close    0.02   1.0     1.0   0.82  1.0    0.99              0.8
Volume       0.0    0.0     0.0   0.0   0.0    0.0               0.0
H-L          0.0    0.0     0.0   0.45  0.01   0.0               0.08
O-C          0.85   1.0     0.0   1.0   0.53   1.0               0.73
3day MA      0.0    0.0     0.0   0.27  0.01   1.0               0.21
10day MA     0.0    0.0     0.02  0.09  0.0    0.27              0.06
30day MA     0.0    0.0     0.0   0.18  0.0    0.75              0.16
Std_dev      0.0    0.0     0.0   0.64  0.01   0.0               0.11
meanplot = pd.DataFrame(list(r.items()), columns=['Feature', 'Mean Ranking'])
# Sort by mean ranking
meanplot = meanplot.sort_values('Mean Ranking', ascending=False)
# Note: factorplot/size were renamed catplot/height in newer seaborn versions
g = sns.factorplot(x="Mean Ranking", y="Feature", data=meanplot, kind="bar",
                   size=14, aspect=1.9, palette='coolwarm')