"""
This script contains the basic building blocks for creating a
deep feed-forward neural network from scratch.
Typical usage example:
# define neural network hyperparameters
hyperparameters = {
'nn_structure': (4, 4, 1),
'epochs': 5_000,
'lr': 0.0075,
'nonlinear_functions': {'relu': relu, 'sigmoid': sigmoid},
}
hyperparameters['L'] = len(hyperparameters['nn_structure']) - 1
X_train, Y_train, X_test, Y_test = load_dataset()
#Train model
parameters, costs = nn_model(X_train, Y_train, hyperparameters)
#Test model
y_hat, accuracy = predict(parameters, hyperparameters, X_test, Y_test)
"""
import numpy as np

# module-level random generator, named rng so it is not shadowed by the
# activation-function parameter g used in forward_propagate below
rng = np.random.default_rng(42)

def initialize_parameters(layers):
    """
    Randomly initializes the weights and biases of the neural network.

    Arguments:
    layers (iterable) -- the number of neurons in each layer of the
        neural network, including the input layer.

    Returns:
    parameters (dict) -- python dictionary containing the model parameters
        "W1", "b1", ..., "WL", "bL"
    """
    parameters = dict()
    L = len(layers)
    for l in range(1, L):
        n_l, n_prev = layers[l], layers[l - 1]
        parameters['W%d' % l] = rng.normal(size=(n_l, n_prev), scale=0.01)
        parameters['b%d' % l] = np.zeros((n_l, 1))
    return parameters
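
# Quick shape check (a hypothetical 2-4-1 network, not part of the original
# module): initialize_parameters((2, 4, 1)) returns W1 with shape (4, 2),
# b1 (4, 1), W2 (1, 4) and b2 (1, 1) -- each W[l] is (n[l], n[l-1]).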

def relu(Z):
    """
    Assumes Z is a numpy ndarray. Returns the activations of Z.

    Arguments:
    Z -- a numpy ndarray of any shape

    Returns:
    A -- a numpy ndarray of activations, same shape as Z
    """
    return np.maximum(0, Z)

def sigmoid(Z):
    """
    Assumes Z is a numpy ndarray. Returns the activations of Z.

    Arguments:
    Z -- a numpy ndarray of any shape

    Returns:
    A -- a numpy ndarray of activations, same shape as Z
    """
    return 1.0 / (1.0 + np.exp(-Z))
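
# Illustrative values: relu(np.array([-1.0, 2.0])) -> array([0., 2.]) and
# sigmoid(np.array([0.0])) -> array([0.5]); both apply element-wise.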

def forward_propagate(A, W, b, g):
    """
    Implements the forward propagation of a signal through one layer.

    Arguments:
    A -- a numpy ndarray of activations from the previous layer
    W -- a numpy ndarray of weights
    b -- bias, numpy ndarray of shape (size of the current layer, 1)
    g -- a non-linear activation function, e.g. tanh, relu or sigmoid

    Returns:
    A -- activations for the current layer
    Z -- the pre-activation values W.A + b, cached for back propagation
    """
    Z = W.dot(A) + b
    return g(Z), Z
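
# Shape sketch (following the conventions of initialize_parameters): with m
# examples, A is (n[l-1], m), W is (n[l], n[l-1]) and b is (n[l], 1), so
# Z = W.A + b broadcasts b across the columns and comes out (n[l], m).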

def compute_cost(yhat, Y, m):
    """
    Implements the binary cross-entropy cost function.

    Arguments:
    yhat -- the neural network output, a numpy ndarray of the same shape as Y
    Y -- a numpy ndarray of target labels
    m (int) -- number of samples in the training set

    Returns:
    J (float) -- the overall cost
    """
    return (-1 / m) * np.sum((Y * np.log(yhat)) + ((1 - Y) * np.log(1 - yhat)))
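
# Worked example (made-up numbers): for Y = [1, 0] and yhat = [0.9, 0.2],
# J = -(1/2) * (np.log(0.9) + np.log(0.8)) ≈ 0.164.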

def sigmoid_derivative(Z):
    """
    Assumes Z is the input of the sigmoid function. Returns the
    derivative of the sigmoid function with respect to Z.

    Arguments:
    Z -- input passed into the sigmoid function, a numpy ndarray

    Returns:
    sigmoid'(Z) -- the derivative of the sigmoid function with respect to Z
    """
    A = sigmoid(Z)
    return A * (1 - A)

def relu_derivative(Z):
    """
    Assumes Z is the input to the ReLU function. Returns the
    derivative of the ReLU function with respect to Z.

    Arguments:
    Z -- input to the ReLU function, a numpy ndarray

    Returns:
    ReLU'(Z) -- the derivative of the ReLU function with respect to Z
    """
    return np.array(Z > 0, dtype=np.float32)

def tanh_derivative(Z):
    """
    Assumes Z is the input to the tanh function. Returns the
    derivative of the tanh function with respect to Z.

    Arguments:
    Z -- input to the tanh function, a numpy ndarray

    Returns:
    tanh'(Z) -- the derivative of the tanh function with respect to Z
    """
    a = np.tanh(Z)
    return 1 - (a ** 2)
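
# Sanity checks at Z = 0: sigmoid_derivative gives 0.25, tanh_derivative
# gives 1, and relu_derivative is 0 for Z <= 0 and 1 for Z > 0.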

def retrieve_weights_for_layer(l, parameters):
    """
    Returns the weights and bias for layer l.

    Arguments:
    l (int) -- layer number
    parameters (dict) -- all the weights and biases of the entire network

    Returns:
    tuple (weights, bias) -- the weight and bias matrices for layer l
    """
    return parameters['W' + str(l)], parameters['b' + str(l)]

def nonlinear_function_for_layer(l, hyperparameters):
    """
    Returns the ReLU activation function for hidden layers of the
    neural network and the sigmoid activation function for the output
    layer (L).

    Arguments:
    l (int) -- layer number
    hyperparameters (dict) -- the neural network hyperparameters

    Returns:
    g -- an activation function
    """
    activation_functions = hyperparameters['nonlinear_functions']
    g = (activation_functions['sigmoid'] if l == hyperparameters['L']
         else activation_functions['relu'])
    return g

def back_propagate(dA, A_prev, Z, W, b, l, L, m, parameters, hyperparameters):
    """
    Returns dA[l-1], the derivative of the loss with respect to A[l-1].
    Also updates the weights and biases for layer l in place.

    Arguments:
    dA -- the derivative of the cost with respect to A[l]
    A_prev -- input to layer l
    Z -- Z value for layer l
    W -- weights for layer l
    b -- bias for layer l
    l (int) -- layer number
    L (int) -- the last layer
    m (int) -- number of training examples
    parameters (dict) -- network weights and biases
    hyperparameters (dict) -- neural network hyperparameters

    Returns:
    dA[l-1] -- the derivative of the loss with respect to A[l-1]
    """
    derivative = sigmoid_derivative if l == L else relu_derivative
    g_prime = derivative(Z)
    dZ = dA * g_prime
    dW = 1 / m * np.dot(dZ, A_prev.T)
    db = 1 / m * np.sum(dZ, axis=1, keepdims=True)
    # dA[l-1] must be computed before the in-place parameter update below;
    # the -= on the dict entry mutates the same ndarray that W refers to,
    # so taking W.T afterwards would use the already-updated weights.
    dA_prev = np.dot(W.T, dZ) if l != 1 else None
    alpha = hyperparameters['lr']
    parameters['W' + str(l)] -= alpha * dW
    parameters['b' + str(l)] -= alpha * db
    return dA_prev
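
# Gradient shapes mirror the forward pass: dZ is (n[l], m), dW is
# (n[l], n[l-1]) like W, db is (n[l], 1) like b, and dA[l-1] is (n[l-1], m).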

def optimize(parameters, hyperparameters, A_prev, Y, l):
    """
    Recursive function that implements both forward and backward
    propagation. The base case initializes back propagation by
    computing and returning dLdAL, the derivative of the loss with
    respect to the neural network output. The inductive case forward
    propagates as long as the last layer hasn't been reached yet.

    Arguments:
    parameters (dict) -- all the weights and biases for each layer in
        the network
    hyperparameters (dict) -- hyperparameters (anything that is not a
        weight or a bias)
    A_prev -- numpy ndarray of activations from the previous layer
    Y -- numpy ndarray of target labels
    l (int) -- the current layer

    Returns:
    dA[l-1] -- the derivative of the loss with respect to A[l-1]
    """
    L = hyperparameters.get('L')
    m = np.shape(Y)[1]
    if l == L + 1:
        hyperparameters["J"] = compute_cost(A_prev, Y, m)
        dA = np.divide(-Y, A_prev) + np.divide(1 - Y, 1 - A_prev)
        return dA
    W, b = retrieve_weights_for_layer(l, parameters)
    g = nonlinear_function_for_layer(l, hyperparameters)
    A, Z = forward_propagate(A_prev, W, b, g)
    dA = optimize(parameters, hyperparameters, A, Y, l + 1)
    return back_propagate(dA, A_prev, Z, W, b, l, L, m, parameters,
                          hyperparameters)
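
# Unrolled for a two-layer network (L = 2): optimize(..., l=1) forward
# propagates layer 1, recurses into optimize(..., l=2) which forward
# propagates layer 2, then the base case at l = 3 computes the cost and
# returns dA2; back_propagate then runs for layer 2 and finally layer 1.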

def train(A, Y, parameters, hyperparameters):
    """
    Optimizes parameters using back propagation and gradient descent.

    Arguments:
    A -- X, the input matrix
    Y -- the target labels
    parameters -- dict of randomly initialized weights and biases for
        the entire neural network
    hyperparameters -- dict of neural network hyperparameters (anything
        that is not a weight or a bias)

    Returns:
    parameters -- optimized weights and biases that can be used for
        making predictions
    costs -- list of the cost (J) for each logged iteration
    """
    costs = list()
    epochs = hyperparameters.get('epochs')
    for t in range(epochs):
        optimize(parameters, hyperparameters, A, Y, 1)
        if t % 100 == 0 or t == epochs - 1:
            J = hyperparameters.get('J')
            costs.append(J)
            print(f"Cost after iteration {t}: {J}")
    return parameters, costs

def nn_model(A, Y, hyperparameters):
    """
    Initializes, optimizes and returns the weights of a neural network.
    """
    parameters = initialize_parameters(hyperparameters.get('nn_structure'))
    parameters, costs = train(A, Y, parameters, hyperparameters)
    return parameters, costs

def predict(parameters, hyperparameters=None, A=None, Y=None, evaluation=True):
    """
    Returns the predicted classes and, when labels are supplied, the
    accuracy of the model.

    Arguments:
    parameters -- weights and biases of the model
    hyperparameters -- the model's hyperparameters
    A -- input to the model
    Y -- target labels
    evaluation (bool) -- return the accuracy when True and Y is given

    Returns:
    yhat -- predicted classes
    acc or pred -- the model's accuracy when evaluating, otherwise the
        predicted probabilities
    """
    if hyperparameters is not None:
        L = hyperparameters['L']
    else:
        # fall back to inferring the depth from the parameter dict
        L = len(parameters) // 2
    w_and_b = [(parameters.get('W%d' % l), parameters.get('b%d' % l))
               for l in range(1, L + 1)]
    *hidden_layers_w_and_b, output_layer_w_and_b = w_and_b
    for W, b in hidden_layers_w_and_b:
        A = relu(np.dot(W, A) + b)
    W, b = output_layer_w_and_b
    pred = sigmoid(W.dot(A) + b)
    yhat = np.where(pred > 0.5, 1, 0)
    if evaluation and Y is not None:
        return yhat, np.mean(yhat == Y)
    return yhat, pred
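
if __name__ == "__main__":
    # Minimal smoke test (an illustrative sketch, not part of the original
    # module): the synthetic dataset below is made up -- points in the plane
    # labelled 1 when they fall inside the unit circle -- just to exercise
    # the documented train/predict workflow end to end.
    X_train = rng.uniform(-2, 2, size=(2, 400))
    Y_train = (np.sum(X_train ** 2, axis=0, keepdims=True) < 1).astype(float)

    hyperparameters = {
        'nn_structure': (2, 4, 1),
        'epochs': 2_000,
        'lr': 0.05,
        'nonlinear_functions': {'relu': relu, 'sigmoid': sigmoid},
    }
    hyperparameters['L'] = len(hyperparameters['nn_structure']) - 1

    parameters, costs = nn_model(X_train, Y_train, hyperparameters)
    yhat, accuracy = predict(parameters, hyperparameters, X_train, Y_train)
    print(f"Final cost: {costs[-1]:.4f}")
    print(f"Training accuracy: {accuracy:.3f}")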