# Derived price features.
df['H-L'] = df['High'] - df['Low']          # intraday range
df['O-C'] = df['Adj Close'] - df['Open']    # adjusted close minus open

# Moving averages are taken over the previous day's adjusted close
# (shift(1)), so each row's average excludes that row's own value.
prev_adj_close = df['Adj Close'].shift(1)
df['3day MA'] = prev_adj_close.rolling(window=3).mean()
df['10day MA'] = prev_adj_close.rolling(window=10).mean()
df['30day MA'] = prev_adj_close.rolling(window=30).mean()

# 5-day rolling standard deviation of the (unshifted) adjusted close.
df['Std_dev'] = df['Adj Close'].rolling(5).std()

df.dtypes
# Per-column missing-value counts: the rolling windows leave leading NaNs
# (3 / 10 / 30 rows for the shifted averages, 4 rows for the 5-day std).
df.isnull().sum()
Open 0 High 0 Low 0 Close 0 Adj Close 0 Volume 0 H-L 0 O-C 0 3day MA 3 10day MA 10 30day MA 30 Std_dev 4 dtype: int64
# Count the missing values in every column and draw them as a bar chart.
# NOTE(review): the original comment here said "-1 indicates missing data",
# but isnull() counts NaN values, not -1 -- confirm which is intended.

# Another, less common way of setting the canvas size.
plt.rcParams['figure.figsize'] = (15, 8)

df_missing_count = df.isnull().sum()
df_missing_count.plot(kind='bar')
plt.show()
# Print, for every column, the number of distinct values and the number of
# missing values.
# The header is printed once, before the loop, so it is not repeated per row.
print("column nunique NaN")
for column in df:
    # Count real missing values with isnull(); comparing against -1 would
    # instead count legitimate values equal to -1 (e.g. in 'O-C') and miss
    # the NaNs produced by the rolling windows.
    print("{0:15} {1:6d} {2:6}".format(
        column, df[column].nunique(), df[column].isnull().sum()))
column nunique NaN Open 1082 0 High 1083 0 Low 1025 0 Close 1098 0 Adj Close 1173 0 Volume 1250 0 H-L 357 0 O-C 1237 2 3day MA 1240 0 10day MA 1244 0 30day MA 1230 0 Std_dev 1252 0
import seaborn as sns

# One way to build a custom palette:
# cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Pairwise correlation of all columns over every row, drawn as an
# annotated heatmap.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='Blues')
# Histogram of every column on a 3x4 grid.
columns_multi = list(df.columns)
df.hist(column=columns_multi, layout=(3, 4))

# A less common way to resize the canvas: grab the current figure and
# change its size afterwards.
fig = plt.gcf()
fig.set_size_inches(20, 9)

# Kernel density estimate of every column, one subplot each, with
# independent x axes.
names = columns_multi
df.plot.density(subplots=True, layout=(3, 4), sharex=False)
# Scatter-plot matrix of all columns with KDE plots on the diagonal.
# seaborn renamed the per-facet `size` argument to `height`; the old name is
# no longer accepted by current releases.
sns.pairplot(df, height=3, diag_kind="kde")
from sklearn.feature_selection import RFE,RFECV, f_regression from sklearn.linear_model import (LinearRegression, Ridge, Lasso,LarsCV) from stability_selection import StabilitySelection, RandomizedLasso from sklearn.preprocessing import MinMaxScaler from sklearn.ensemble import RandomForestRegressor from sklearn.ensemble import RandomForestClassifier from sklearn.svm import SVR
# Drop the rows that contain NaN before fitting any model.
df = df.dropna()

# NOTE(review): X is built from *all* columns of df, which includes the
# target column 'Adj Close' itself -- confirm this is intended, since it
# lets every model see the target among its features.
Y = df['Adj Close'].values
X = df.values
colnames = df.columns

# Maps each method name to a {feature name: scaled score} dictionary.
ranks = {}


def ranking(ranks, names, order=1):
    """Scale raw feature scores with MinMaxScaler and key them by name.

    Args:
        ranks: one raw score per feature (coefficients, importances, ...).
        names: feature names, in the same order as ``ranks``.
        order: multiplier applied before scaling; pass -1 when a smaller
            raw value means a better feature (e.g. RFE rankings).

    Returns:
        dict mapping each feature name to its scaled score rounded to two
        decimals.
    """
    scaler = MinMaxScaler()
    # The scaler works column-wise, so present the scores as one column.
    score_column = order * np.array([ranks]).T
    scaled = scaler.fit_transform(score_column).T[0]
    return {name: round(score, 2) for name, score in zip(names, scaled)}
# Linear regression.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises a TypeError on current releases.
# NOTE(review): coefficients are reported on the original feature scale
# either way, but with collinear columns the solution may differ slightly
# from the old normalize=True fit -- confirm if exact parity matters.
lr = LinearRegression()
lr.fit(X, Y)
ranks["LinReg"] = ranking(np.abs(lr.coef_), colnames)

# Ridge regression.
ridge = Ridge(alpha=7)
ridge.fit(X, Y)
ranks['Ridge'] = ranking(np.abs(ridge.coef_), colnames)

# Lasso regression.
lasso = Lasso(alpha=.05)
lasso.fit(X, Y)
ranks["Lasso"] = ranking(np.abs(lasso.coef_), colnames)
# Feature importance read from the fitted forest's `feature_importances_`
# attribute.
X_1 = dataset[['Open', 'High', 'Low', 'Volume', 'Increase_Decrease',
               'Buy_Sell_on_Open', 'Buy_Sell', 'Returns']]
y_1 = dataset['Adj Close']

# Create the random forest regressor.
clf = RandomForestRegressor(random_state=0, n_jobs=-1)
# Train the model.
model = clf.fit(X_1, y_1)

# Compute the feature importances.
importances = model.feature_importances_
# Sort the importances in descending order.
indices = np.argsort(importances)[::-1]
# Reorder the feature names to match the sorted importances. The indices
# refer to the columns of X_1 (the training matrix), not to those of the
# full `dataset` frame, so the names must be taken from X_1.
names = [X_1.columns[i] for i in indices]

# Create the canvas.
plt.figure(figsize=(10, 6))
# Add the title.
plt.title("Feature Importance")
# Add the bars; there is one bar per column of X_1.
plt.bar(range(X_1.shape[1]), importances[indices])
# Label the x axis with the feature names.
plt.xticks(range(X_1.shape[1]), names, rotation=90)
# Feature importance read from the fitted forest's `feature_importances_`
# attribute, this time for a classifier predicting 'Increase_Decrease'.
X2 = dataset[['Open', 'High', 'Low', 'Adj Close', 'Volume',
              'Buy_Sell_on_Open', 'Buy_Sell', 'Returns']]
y2 = dataset['Increase_Decrease']

clf = RandomForestClassifier(random_state=0, n_jobs=-1)
model = clf.fit(X2, y2)

importances = model.feature_importances_
# Indices of the features, most important first.
indices = np.argsort(importances)[::-1]
# The indices refer to the columns of X2 (the training matrix), not to
# those of the full `dataset` frame, so the names must be taken from X2.
names = [X2.columns[i] for i in indices]

plt.figure(figsize=(10, 6))
plt.title("Feature Importance")
plt.bar(range(X2.shape[1]), importances[indices])
plt.xticks(range(X2.shape[1]), names, rotation=90)
plt.show()
# Fit a 50-tree random forest regressor on X / Y and store its scaled
# feature importances under the "RF" key.
rf = RandomForestRegressor(n_jobs=-1, n_estimators=50, verbose=3).fit(X, Y)
ranks["RF"] = ranking(rf.feature_importances_, colnames)

# Article text: the next two methods are "top-level" feature selection
# algorithms. They are called top-level because they are built on top of
# model-based feature selection (e.g. regression or SVM): models are fitted
# on different subsets and the results are aggregated into the final
# feature scores.
# Stability selection with a randomized Lasso as the base estimator,
# evaluated over a grid of 100 alpha values.
lambda_grid = np.linspace(0.001, 0.5, num=100)
rlasso = RandomizedLasso(alpha=0.04)
selector = StabilitySelection(
    base_estimator=rlasso,
    lambda_name='alpha',
    lambda_grid=lambda_grid,
    threshold=0.9,
    verbose=1,
)
selector.fit(X, Y)

# Each feature's score is its highest stability score across the grid.
max_stability = np.abs(selector.stability_scores_.max(axis=1))
ranks["rlasso/Stability"] = ranking(max_stability, colnames)
print('finished')
{'Open': 1.0, 'High': 1.0, 'Low': 0.76, 'Close': 1.0, 'Adj Close': 0.99, 'Volume': 0.0, 'H-L': 0.0, 'O-C': 1.0, '3day MA': 1.0, '10day MA': 0.27, '30day MA': 0.75, 'Std_dev': 0.0} finished
# `plot_stability_path` ships with the stability_selection package but is
# not among the names imported from it earlier in the visible code, so it
# is imported here to avoid a NameError.
from stability_selection import plot_stability_path

# Plot the stability scores of every feature along the lambda grid.
fig, ax = plot_stability_path(selector)
fig.set_size_inches(15, 6)
fig.show()
# Integer indices of the features kept by the selector, and every
# feature's highest stability score across the lambda grid.
selected_variables = selector.get_support(indices=True)
selected_scores = selector.stability_scores_.max(axis=1)

print('Selected variables are:')
print('-----------------------')
# The value in brackets is the column index of the feature, not its name.
for position, variable in enumerate(selected_variables, start=1):
    print('Variable %d: [%d], score %.3f'
          % (position, variable, selected_scores[variable]))
Selected variables are: ----------------------- Variable 1: [0], score 1.000 Variable 2: [1], score 1.000 Variable 3: [3], score 1.000 Variable 4: [4], score 0.990 Variable 5: [7], score 1.000 Variable 6: [8], score 1.000
sklearn.feature_selection.RFE(estimator, *, n_features_to_select=None, step=1, verbose=0, importance_getter='auto')
estimator: Estimator instance。一种带有 "fit"(拟合)方法的监督学习估计器,它提供关于特征重要性的信息(例如 "coef_"、"feature_importances_")。
n_features_to_select: int or float, default=None。要选择的特征的数量。如果为 'None',则选择一半的特征。如果为整数,则该参数为要选择的特征的绝对数量。如果为 0 和 1 之间的浮点数,则表示要选择的特征所占的比例。
step: int or float, default=1。如果大于或等于 1,那么 'step' 对应于每次迭代要删除的(整数)特征数。如果在 (0.0, 1.0) 范围内,则 'step' 对应于每次迭代中要删除的特征的百分比(向下舍入)。
verbose: int, default=0。控制输出的详细程度。
importance_getter: str or callable, default='auto'。如果是 'auto',则通过估计器的 'coef_' 或 'feature_importances_' 属性获取特征重要性。
# Recursive feature elimination driven by a linear regression.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises a TypeError on current releases.
lr = LinearRegression()
lr.fit(X, Y)

# Keep eliminating until a single feature is left, so that every feature
# receives its own rank.
rfe = RFE(lr, n_features_to_select=1, verbose=3)
rfe.fit(X, Y)

# In rfe.ranking_ the best feature has rank 1; order=-1 flips the scale so
# that the best feature ends up with the highest scaled score.
ranks["RFE"] = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)
Fitting estimator with 12 features. ... Fitting estimator with 2 features.
# Article text: the package can be used for feature elimination and also
# provides RFECV, which ranks the features through cross-validation.

# Instantiate the estimator and the feature selector.
svr_mod = SVR(kernel="linear")
rfecv = RFECV(svr_mod, cv=5)

# Train the model.
rfecv.fit(X, Y)
# Rank 1 marks the best features; order=-1 flips the scale so that they
# receive the highest scaled score.
ranks["RFECV"] = ranking(list(map(float, rfecv.ranking_)), colnames, order=-1)

# Print support and ranking.
print(rfecv.support_)
print(rfecv.ranking_)
# X is a plain NumPy array (built with df.values) and has no `.columns`
# attribute; the feature names are kept in `colnames`.
print(colnames)
# Drop the features judged unimportant in step two.
# X = X.drop('sex', axis=1)

# Instantiate the estimator.
# The `normalize` argument was removed from LarsCV in scikit-learn 1.4;
# current releases do not normalize, which matches the normalize=False
# that was passed here before.
# NOTE(review): on scikit-learn < 1.2 the default still normalized -- pin
# the library version or scale explicitly if that matters.
larscv = LarsCV(cv=5)

# Train the model.
larscv.fit(X, Y)
# LarsCV exposes fitted coefficients (`coef_`), not a `ranking_` attribute
# (that one belongs to RFE / RFECV), so the features are scored by absolute
# coefficient size, exactly like the other linear models.
ranks["LarsCV"] = ranking(np.abs(larscv.coef_), colnames)

# Print R^2 and the estimated alpha.
print(larscv.score(X, Y))
print(larscv.alpha_)

# Article text: the two methods above are cross-validated and are worth
# using when feature importance matters a lot. They take a while to run,
# so readers are invited to run them on their own to obtain the results.
# Average every feature's scaled score over all methods collected so far.
r = {
    name: round(np.mean([ranks[method][name] for method in ranks]), 2)
    for name in colnames
}

# Column order of the table: the methods alphabetically, then the mean.
methods = sorted(ranks) + ["Mean"]
ranks["Mean"] = r

# Header row followed by one row per feature.
print(" %s" % " ".join(methods))
for name in colnames:
    row = [str(ranks[method][name]) for method in methods]
    print("%s %s" % (name, " ".join(row)))
LassoLinRegRFRFERidgerlasso/StabilityMean Open1. High0. Low0. Close0.140.00.640.550.321.00.44 Adj Close0. Volume0. H-L0. O-C0.851. 3day MA0. 10day MA0. 30day MA0. Std_dev0.
# Put the mean ranking of every feature into a DataFrame.
meanplot = pd.DataFrame(list(r.items()), columns=['Feature', 'Mean Ranking'])

# Sort so that the highest-ranked feature comes first.
meanplot = meanplot.sort_values('Mean Ranking', ascending=False)

# Horizontal bar chart of the mean ranking per feature.
# seaborn renamed `factorplot` to `catplot` and its `size` argument to
# `height`; the old names are no longer available in current releases.
g = sns.catplot(x="Mean Ranking", y="Feature", data=meanplot, kind="bar",
                height=14, aspect=1.9, palette='coolwarm')
原文标题:YYDS!使用 Python 全面分析股票数据特征
快来发表一下你的评论吧 !