[英]CSV >> Tensorflow >> regression (via neural network) model
無盡的谷歌搜索讓我在 Python 和 numpy 方面得到了更好的教育,但在解決我的任務方面仍然一無所知。 我想讀取整數/浮點值的 CSV 並使用神經網絡預測值。 我發現了幾個讀取 Iris 數據集並進行分類的示例,但我不明白如何使它們用於回歸。 有人可以幫我連接點嗎?
這是輸入的一行:
16804,0,1,0,1,1,0,1,0,1,0,1,0,0,1,1,0,0,1,0,1,0,1,0,1, 0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,1,1,0,0,1,0,1,0,1,0, 1,0,1,0,1,0,1,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, 0,1,0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,1,0,1,0,1,0,1,1, 0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, 0,1,0,1,0,1,1,0,0,1,0,1,0,1,0,1,0,1,0,1,1,0,0,1,0, 0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,1,0,0, 0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0, 0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1, 0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0, 0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0, 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0.490265,0.620805,0.54977,0.869299,0.422268,0.351223,0.33572,0.68308,57040.8,5730.84,873.840.84 0.365993,6135.81
那應該是 925 個值。 最后一列是輸出。 第一個是 RowID。 大多數是二進制值,因為我已經完成了 one-hot 編碼。 測試文件沒有輸出/最后一列。 完整的訓練文件大約有 1000 萬行。 一般的 MxN 解決方案就可以了。
編輯:讓我們使用此示例數據,因為 Iris 是一個分類問題,但請注意,以上是我的真正目標。 我刪除了 ID 列。 讓我們在給定其他 6 列的情況下預測最后一列。 這有 45 行。 (來源: http : //www.stat.ufl.edu/~winner/data/civwar2.dat )
100,1861,5,2,3,5,38 112,1863,11,7,4,59.82,15.18 113,1862,34,32,1,79.65,2.65 90,1862,5,2,3,68.89 ,5.56 93,1862,14,10,4,61.29,17.2 179,1862,22,19,3,62.01,8.89 99,1861,22,16,6,67.68,27.27 2,11,6,1816 ,78.38,8.11 107,1863,17,11,5,60.75,5.61 156,1862,32,30,2,60.9,12.82 152,1862,23,21,2,73.55,6,1367 ,3,54.17,20.83 134,1862,22,21,1,67.91,9.7 180,1862,23,16,4,69.44,3.89 143,1863,23,19,4,81.12,16,103. ,12,2,31.82,9.09 157,1862,15,10,5,52.23,24.84 101,1863,4,1,3,58.42,18.81 115,1862,14,11,3,86.2816,20 ,7,6,1,70.87,0 90,1862,11,11,0,70,4.44 105,1862,20,17,3,80,4.76 104,1862,11,9,1,29.81,9.62 102 ,1862,17,10,7,49.02,6.86 112,1862,19,14,5,26.79,14.29 87,1862,6,3,3,8.05,72.41 92,1862,4,3,0,11. 86.96 108,1862,12,7,3,16.67,25 86,1864,0,0,0,2.33,11.63 82,1864,4,3,1,81.71,8.54 76,1864,1,0,1, 48.68,6.58 79,1864,0,0,0,15.19,21.52 85,1864,1,1,0,89.41,3.53 85,1864,1,1,0,56.47,0 85,1864,0,0, 0,31.76,15.29 87,1864,6,5,0,81.61,3.45 85,1864,5,5,0,72.94,0 83,1864,0,0,0,46.99,2.38 101,1 864,5,5,0,1.98,95.05 99,1864,6,6,0,42.42,9.09 10,1864,0,0,0,50,9 98,1864,6,6,0,79.59,3.06 10,1864,0,0,0,71,9 78,1864,5,5,0,70.51,1.28 89,1864,4,4,0,59.55,13.48
讓我補充一點,這是一項常見任務,但我讀過的任何論壇似乎都沒有回答,因此我提出了這個問題。 我可以給你我損壞的代碼,但我不想浪費你的時間在功能不正確的代碼上。 抱歉我這樣問過。 我只是不了解 API,文檔也沒有告訴我數據類型。
這是我將 CSV 讀入兩個 ndarray 的最新代碼:
#!/usr/bin/env python
import tensorflow as tf
import csv
import numpy as np
from numpy import genfromtxt
# Build Example Data is CSV format, but use Iris data
from sklearn import datasets
from sklearn.cross_validation import train_test_split
import sklearn
def buildDataFromIris(path="t100.csv.out", label_col=924):
    """Load a numeric CSV and split it into a feature matrix and a label vector.

    Parameters
    ----------
    path : str
        CSV file of comma-separated numbers, one sample per row.
        Defaults to the asker's file for backward compatibility.
    label_col : int
        Index of the column holding the target value. Defaults to 924,
        the last column of the asker's 925-column file.

    Returns
    -------
    (data, labels)
        ``data`` is the (M, d-1) feature matrix with the label column
        removed; ``labels`` is the (M,) target vector.
    """
    data = np.loadtxt(open(path, "rb"), delimiter=",", skiprows=0)
    # Copy the label column out before deleting it from the features.
    labels = data[:, label_col].copy()
    print("labels: ", type(labels), labels.shape, labels.ndim)
    data = np.delete(data, [label_col], axis=1)
    print("data: ", type(data), data.shape, data.ndim)
    return data, labels
這是我想使用的基本代碼。 這來自的示例也不完整。 以下鏈接中的 API 含糊不清。 如果我至少可以弄清楚輸入到 DNNRegressor 和文檔中的其他數據類型的數據類型,我也許可以編寫一些自定義代碼。
# Build a DNN regression estimator (doc-style sketch; education_emb and
# occupation_emb are placeholder feature columns defined elsewhere).
estimator = DNNRegressor(
    feature_columns=[education_emb, occupation_emb],
    hidden_units=[1024, 512, 256])

# Or estimator using the ProximalAdagradOptimizer optimizer with
# regularization.
estimator = DNNRegressor(
    feature_columns=[education_emb, occupation_emb],
    hidden_units=[1024, 512, 256],
    optimizer=tf.train.ProximalAdagradOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=0.001
    ))

# Input builders: each must be a callable (the original `def input_fn_train:`
# without parentheses is a syntax error) returning (features, labels).
def input_fn_train():  # returns x, Y
    pass

estimator.fit(input_fn=input_fn_train)

def input_fn_eval():  # returns x, Y
    pass

estimator.evaluate(input_fn=input_fn_eval)
estimator.predict(x=x)
然后最大的問題是如何讓這些協同工作。
這是我一直在看的幾頁。
我發現過去也很難弄清楚較低級別的 Tensorflow,並且文檔並不令人驚奇。如果您轉而專注於掌握 sklearn,您應該會發現使用 skflow 相對容易。skflow 的級別比 tensorflow 高得多,並且它的 API 幾乎與 sklearn 相同。
現在回答:
作為回歸示例,我們將僅對 iris 數據集執行回歸。 現在這是一個愚蠢的想法,但這只是為了演示如何使用DNNRegressor
。
第一次使用新 API 時,請嘗試使用盡可能少的參數。 你只是想讓一些東西工作。 因此,我建議您可以像這樣設置DNNRegressor
:
estimator = skflow.DNNRegressor(hidden_units=[16, 16])
我保持我的 # 個隱藏單元很小,因為我現在沒有太多的計算能力。
然后你給它訓練數據train_X
和訓練標簽train_y
並按如下方式擬合它:
estimator.fit(train_X, train_y)
這是所有sklearn
分類器和回歸器的標准程序, skflow
只是將tensorflow
擴展為類似於sklearn
。 我還設置了參數steps = 10
以便在僅運行 10 次迭代時訓練完成得更快。
現在,如果您希望它預測一些新數據test_X
,您可以按如下方式進行:
pred = estimator.predict(test_X)
同樣,這是所有sklearn
代碼的標准程序。 就是這樣 - skflow
非常簡單,你只需要這三行!
如果您不太熟悉機器學習,您的訓練數據通常是大小為 M x d 的 ndarray(矩陣),其中有 M 個訓練示例和 d 個特徵。你的標籤是 M x 1(形狀為 (M,) 的 ndarray)。
所以你所擁有的是這樣的:
Features: Sepal Width Sepal Length ... Labels
[ 5.1 2.5 ] [0 (setosa) ]
X = [ 2.3 2.4 ] y = [1 (virginica) ]
[ ... ... ] [ .... ]
[ 1.3 4.5 ] [2 (Versicolour)]
(請注意,我只是把所有這些數字都編了起來)。
測試數據只是一個 N x d 矩陣,其中有 N 個測試示例。測試示例都需要有 d 個特徵。預測函數將接收測試數據並返回給您形狀為 N x 1 的測試標籤(形狀為 (N,) 的 ndarray)。
您沒有提供 .csv 文件,因此我會讓您將數據解析為該格式。 不過方便的是,我們可以使用sklearn.datsets.load_iris()
來獲得我們想要的X
和y
。 只是
# Pull the ready-made iris arrays: X holds the features, y the targets.
iris = datasets.load_iris()
X, y = iris.data, iris.target
DNNRegressor
的輸出將是一堆實數(如 1.6789)。 但是 iris-dataset 有標簽 0、1 和 2——Setosa、Versicolour 和 Virginia 的整數 ID。 要使用此回歸器進行分類,我們只需將其四舍五入到最近的標簽 (0, 1, 2)。 例如,1.6789 的預測將四舍五入為 2。
我發現我從一個工作示例中學到的東西最多。 所以這是一個非常簡化的工作示例:
隨意發表任何進一步的問題作為評論。
我最終得到了幾個選項。 我不知道為什么起床和跑步如此困難。 首先,這是基於@user2570465 的代碼。
import tensorflow as tf
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import tensorflow.contrib.learn as skflow
def buildDataFromIris():
    """Return the iris dataset as a (features, targets) pair of arrays."""
    bunch = datasets.load_iris()
    return bunch.data, bunch.target
# Load the data, infer one real-valued feature column per input feature,
# and build a small DNN regressor on top of them.
X, y = buildDataFromIris()
feature_cols = tf.contrib.learn.infer_real_valued_columns_from_input(X)
estimator = skflow.DNNRegressor(feature_columns=feature_cols, hidden_units=[10, 10])
# Hold out a test split and fit ONLY on the training partition.  The
# original called estimator.fit(X, y), which trained on the full dataset
# and leaked the test set into training.
train_X, test_X, train_y, test_y = train_test_split(X, y)
estimator.fit(train_X, train_y, steps=10)
test_preds = estimator.predict(test_X)
def CalculateAccuracy(X, y, model=None):
    """Round continuous regression predictions to the nearest class id
    (0, 1 or 2) and return the fraction that matches the labels.

    Parameters
    ----------
    X : array-like, shape (M, d)
        Feature rows to score.
    y : array-like, shape (M,)
        Integer class labels (0, 1 or 2).
    model : optional
        Any object with a ``predict(X)`` method; defaults to the
        module-level ``estimator`` for backward compatibility.

    Returns
    -------
    float
        Accuracy in [0, 1].
    """
    if model is None:
        model = estimator
    continuous_predictions = model.predict(X)
    closest_class = []
    for pred in continuous_predictions:
        # Distance to each of the three class ids.  The original computed
        # abs(pred-1) three times, so argmin always returned class 0;
        # compare against 0, 1 and 2 instead.
        differences = np.array([abs(pred - 0), abs(pred - 1), abs(pred - 2)])
        closest_class.append(np.argmin(differences))
    num_correct = np.sum(closest_class == y)
    accuracy = float(num_correct) / len(y)
    return accuracy
# Report how well the rounded regression predictions recover the class
# labels, on the data the model was fit on vs. the held-out split.
train_accuracy = CalculateAccuracy(train_X, train_y)
test_accuracy = CalculateAccuracy(test_X, test_y)
print("Train accuracy: %f" % train_accuracy)
print("Test accuracy: %f" % test_accuracy)
其他解決方案從較小的組件構建模型。 這是一個計算 Sig(X*W1+b1)*W2+b2 = Y. Optimizer=Adam, loss=L2, eval=L2 和 MSE 的片段。
# ----- data split -----
x_train = X[:train_size]
y_train = Y[:train_size]
x_val = X[train_size:]
y_val = Y[train_size:]
print("x_train: {}".format(x_train.shape))
# NOTE(review): this overwrites the slice of X taken above with a slice of
# all_x -- presumably the intended (preprocessed) feature matrix; confirm
# which source is meant before relying on it.
x_train = all_x[:train_size]
print("x_train: {}".format(x_train.shape))

# ----- build the model: Y_hat = sigmoid(X*w_h + b_h) * w_o + b_o -----
X = tf.placeholder(tf.float32, [None, n_input], name='X')
Y = tf.placeholder(tf.float32, [None, n_output], name='Y')
w_h = tf.Variable(tf.random_uniform([n_input, layer1_neurons], minval=-1, maxval=1, dtype=tf.float32))
b_h = tf.Variable(tf.zeros([1, layer1_neurons], dtype=tf.float32))
h = tf.nn.sigmoid(tf.matmul(X, w_h) + b_h)
w_o = tf.Variable(tf.random_uniform([layer1_neurons, 1], minval=-1, maxval=1, dtype=tf.float32))
b_o = tf.Variable(tf.zeros([1, 1], dtype=tf.float32))
model = tf.matmul(h, w_o) + b_o

# Define the loss op ONCE.  The original rebuilt tf.nn.l2_loss(...) inside
# the training loop every epoch, which grows the graph (and leaks memory)
# on every iteration; it also left a dead tf.nn.l2_loss(...) statement here.
loss = tf.nn.l2_loss(model - Y)
train_op = tf.train.AdamOptimizer().minimize(loss)
# Explicit equivalent of the L2 loss: sum((model - Y) ** 2) / 2.
output = tf.reduce_sum(tf.square(model - Y)) / 2

# ----- launch the session -----
sess = tf.Session()
# initialize_all_variables() is deprecated in favour of this.
sess.run(tf.global_variables_initializer())

errors = []
for i in range(numEpochs):
    for start, end in zip(range(0, len(x_train), batchSize), range(batchSize, len(x_train), batchSize)):
        sess.run(train_op, feed_dict={X: x_train[start:end], Y: y_train[start:end]})
    # Evaluate the pre-built loss op on the validation split; feed BOTH
    # placeholders (the original baked y_val into a fresh op each epoch and
    # fed only X).
    cost = sess.run(loss, feed_dict={X: x_val, Y: y_val})
    errors.append(cost)
    if i % 100 == 0:
        print("epoch %d, cost = %g" % (i, cost))
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.