Build a DeepLearning algorithm from scratch in F# 02 – Planar data classification with one hidden layer

Welcome to the second post of our “Build a DeepLearning algorithm from scratch in F#” series.

In our previous article, we described how to implement logistic regression with a neural network mindset. In that scenario, the input layer is connected directly to the output layer.

In today’s post, we’ll build our first neural network with a hidden layer.
You will see a big difference between this model and the one we implemented using logistic regression.

You will learn how to:

  • Implement a 2-class classification neural network with a single hidden layer
  • Use units with a non-linear activation function, such as tanh
  • Compute the cross entropy loss
  • Implement forward and backward propagation

The F# jupyter notebook corresponding to this post is hosted on my personal blog and can be downloaded here.

Importing NuGet packages

#load "Paket.fsx"

Paket.Package
    [
        "FsLab"
    ]
#load "Paket.Generated.Refs.fsx"
open FSharp.Data
open FSharp.Data.CsvExtensions
open MathNet.Numerics.LinearAlgebra
#load "XPlot.Plotly.Paket.fsx"
#load "XPlot.Plotly.fsx"
open XPlot.Plotly

Load Dataset

For simplicity, I exported the original notebook’s dataset to two CSV files using a Python script: one for the X values (features), one for the Y values (labels).

let parse_csv_x (x:CsvFile) =
    seq {
        for row in x.Rows do
            let rowValues = row.Columns
                            |> Seq.map (fun c -> float c)
                            |> Seq.toArray
                       
            yield [| rowValues.[0]; rowValues.[1] |]
        }
       
let parse_csv_y (x:CsvFile) =
    seq {
        for row in x.Rows do
            let rowValues = row.Columns
                            |> Seq.map (fun c -> float c)
                            |> Seq.toArray
                       
            yield rowValues.[0]
        }
       
let load_dataset (csv_x:CsvFile) (csv_Y:CsvFile) =
    let parsed_X_rows =  parse_csv_x csv_x
    let parsed_Y_rows = parse_csv_y csv_Y
   
    let X = parsed_X_rows |> Seq.toArray |> DenseMatrix.ofColumnArrays
    let Y = parsed_Y_rows |> DenseVector.ofSeq
    X,Y

//building our datasets
let csv_X =  CsvFile.Load("C:\\PathToNotebook\\datasets\\X.csv", ",", '"', false, true, 0)
let csv_Y =  CsvFile.Load("C:\\PathToNotebook\\datasets\\Y.csv", ",", '"', false, true, 0)

let X,Y = load_dataset csv_X csv_Y
type MatrixVectorDU =
        | Vector of Vector<float>
        | Matrix of Matrix<float>

let shape x = match x with
                    | Vector n -> (1, n.Count)
                    | Matrix n -> (n.RowCount, n.ColumnCount)
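
As a quick sanity check (this snippet is not in the original notebook, the values are arbitrary):

let demo_matrix = matrix [ [ 1.0; 2.0; 3.0 ]; [ 4.0; 5.0; 6.0 ] ]
let demo_vector = vector [ 1.0; 0.0; 1.0 ]
printfn "%A" (shape (MatrixVectorDU.Matrix demo_matrix)) //(2, 3): 2 rows, 3 columns
printfn "%A" (shape (MatrixVectorDU.Vector demo_vector)) //(1, 3): a vector is treated as a single row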

We will now visualize our planar dataset by plotting it as a scatter plot.

let extract_scatter_serie(x:Matrix<float>) (y:Vector<float>) (serie_label:float) =
    seq {
        for i in 0..x.ColumnCount-1 do
            let row_y = y.[i]
            let row_x = x.Column(i)
            if row_y = serie_label then yield row_x.[0], row_x.[1]
        }

let build_scatter(x:Matrix<float>) (y:Vector<float>) =
    extract_scatter_serie x y 1.0, extract_scatter_serie x y 0.0

let logisctic_scatter = build_scatter X Y
     
let x1 = (fst logisctic_scatter) |> Seq.map (fun x -> fst x)
let y1 = (fst logisctic_scatter) |> Seq.map (fun x -> snd x)
let x2 = (snd logisctic_scatter) |> Seq.map (fun x -> fst x)
let y2 = (snd logisctic_scatter) |> Seq.map (fun x -> snd x)

let scatter_trace_1 =
    Scatter(
                x = x1,
                y = y1,
                name = "1",
                mode = "markers"
    ) :> Trace

let scatter_trace_2 =
    Scatter(
                x = x2,
                y = y2,
                name = "0",
                mode = "markers"
    ) :> Trace
   
[scatter_trace_1; scatter_trace_2]
    |> Chart.Plot
    |> Chart.WithXTitle "X1"
    |> Chart.WithYTitle "X2"
    |> Chart.WithLegend false

let shape_X = shape (MatrixVectorDU.Matrix X)
let shape_Y = shape (MatrixVectorDU.Vector Y)
let m = snd shape_Y //training set size

printfn "shape of X is %s" (string shape_X)
printfn "shape of Y is %s" (string shape_Y)
printfn "m is %s" (string m)

shape of X is (2, 400)
shape of Y is (1, 400)
m is 400

Expected Output:

**shape of X** (2, 400)
**shape of Y** (1, 400)
**m** 400

3 – Simple Logistic Regression

Before building a full neural network, let’s first see how logistic regression performs on this problem. The original Python notebook uses scikit-learn’s built-in logistic regression. What we will do here is reuse the logistic regression we implemented in our previous article. I wrapped the code up into a LogisticRegression module for ease of use.

let createVector size value =
    seq {
        for _ in 0..size-1  do
            yield value
        }

let initialize_with_zeros dim =
    let w = createVector dim 0.0 |> DenseVector.ofSeq
    let b = 0.0
    w, b

//sigmoid applied elementwise: 1 / (1 + exp(-z))
let sigmoid (z:Vector<float>) =
    z.Map(fun x -> -x)
        |> (fun x -> x.PointwiseExp())
        |> (fun x -> 1.0 + x)
        |> (fun x -> 1.0 / x)

module public LogisticRegression =

    type public LogisticRegressionTrainResult =  {
        w: Vector<float>;
        b: float;
        dw: Vector<float>;
        db: float;
        costs: seq<float>
        }

    type public LogisticRegressionEvaluateResult =  {
        costs: seq<float>;
        Y_prediction_test: Vector<float>;
        Y_prediction_train: Vector<float>;
        w: Vector<float>;
        b: float;
        }

    type public LogisticRegressionModel () =
       
       member val w = vector[ 0.0 ] with get, set
       member val b = 0.0 with get, set
       member val costs = Seq.empty<float> with get, set

       member this.Propagate (X:Matrix<float>) (Y:Vector<float>) =
        let m = X.ColumnCount
        let A = X.LeftMultiply this.w
                |> Vector.map (fun x -> x + this.b)
                |> sigmoid
        let cost = A.PointwiseLog().PointwiseMultiply(Y) + Y.Map(fun y -> 1.0 - y).PointwiseMultiply(A.Map(fun a -> 1.0 - a).PointwiseLog())  
                |> (fun x -> x.Sum() |> float)
                |> (fun x -> -(1.0/ float m * x ))
        let dw =  X * (A - Y) |> (fun x -> (1.0/ float m * x ))
        let db =  A - Y
                |> (fun x -> x.Sum() |> float)
                |> (fun x -> (1.0/ float m * x ))

        (dw, db), cost

       member this.Fit (X:Matrix<float>) (Y:Vector<float>) (num_iterations:int) (learning_rate:float) (print_cost:bool) =
        let mutable dw = vector [0.0]
        let mutable db = 0.0
        let dim = shape (MatrixVectorDU.Matrix X) |> fst
        let init = initialize_with_zeros (dim)
        this.w <- fst init
        this.b <- snd init
        for i in 0..num_iterations - 1 do
            let results = this.Propagate X Y
            let grads = fst results
            let cost = snd results
            dw <- fst grads
            db <- snd grads
            this.w <- this.w - learning_rate * dw
            this.b <- this.b - learning_rate * db
            if i % 100 = 0 then this.costs <- Seq.append this.costs [cost]
   
            if print_cost && i % 100 = 0 then (printfn "Cost after iteration %i: %f" i cost)

        {
            w = this.w;
            b = this.b;
            dw = dw;
            db = db;
            costs = this.costs
        }

        member this.Predict (X:Matrix<float>) =
         X.LeftMultiply this.w
                |> Vector.map (fun x -> x + this.b)
                |> sigmoid
                |> Vector.map (fun x -> if x > 0.5 then 1.0 else 0.0)


        member this.Evaluate(x_train:Matrix<float>) (y_train:Vector<float>) (x_test:Matrix<float>) (y_test:Vector<float>) =
            let Y_prediction_test = this.Predict x_test
            let Y_prediction_train = this.Predict x_train

            //Print train/test Errors
            printfn "train accuracy: {%f}" (100.0 * (1.0 - (Y_prediction_train - y_train |> Vector.map(fun x -> abs x) |> Vector.toArray |> Array.average)))
            printfn "test accuracy: {%f} " (100.0 * (1.0 - (Y_prediction_test - y_test |> Vector.map (fun x -> abs x) |> Vector.toArray |> Array.average)))
let logistic_regression_model = LogisticRegression.LogisticRegressionModel()
logistic_regression_model.Fit X Y 2000 0.005 false
logistic_regression_model.Evaluate X Y X Y

train accuracy: {47.500000}
test accuracy: {47.500000}

We can see our accuracy is quite low. Let’s plot the decision boundary for this model to get an idea of what the issue is.

let generate_seq_by x y steps =
    seq {
        let mutable i = x
        while i < y + steps do
            yield i
            i <- i + steps
        }

let generate_key_value_list (x:float) (y:seq<float>) =
    seq {
        for value in y do
            yield [ value; x ]
        }

let extract_heatmap_serie startx starty endx endy steps =
    seq {
        let x = generate_seq_by startx endx steps
        let y = generate_seq_by starty endy steps
       
        for value in x do
            let pair_as_matrix = generate_key_value_list value y |> DenseMatrix.ofColumnSeq
            let predictions = logistic_regression_model.Predict pair_as_matrix
            yield predictions |> Vector.toList
        }
       
let logisctic_heatmap = extract_heatmap_serie -4.0 -4.0 4.0 4.0 0.01 |> Seq.toList
let grid_values = generate_seq_by -4.0 4.0 0.01 |> Seq.toList

let heatmap_trace = Heatmap (
                        z = logisctic_heatmap,
                        x = grid_values,
                        y = grid_values,
                        showlegend = false,
                        showscale = false
                        ) :> Trace
                       
[heatmap_trace; scatter_trace_1; scatter_trace_2]                    
    |> Chart.Plot
    |> Chart.WithTitle "Logistic Regression"
    |> Chart.WithXTitle "X1"
    |> Chart.WithYTitle "X2"
    |> Chart.WithLegend false
 

Interpretation: The dataset is not linearly separable, so logistic regression doesn’t perform well. Hopefully a neural network will do better. Let’s try this now!

4 – Neural Network model

Logistic regression did not work well on the “flower dataset”. We are going to train a Neural Network with a single hidden layer.

Here is our model: a neural network with a single hidden layer, where the hidden units use a tanh activation and the single output unit uses a sigmoid activation.

Mathematically:

For one example \(x^{(i)}\):
$$z^{[1] (i)} = W^{[1]} x^{(i)} + b^{[1] (i)}\tag{1}$$
$$a^{[1] (i)} = \tanh(z^{[1] (i)})\tag{2}$$
$$z^{[2] (i)} = W^{[2]} a^{[1] (i)} + b^{[2] (i)}\tag{3}$$
$$\hat{y}^{(i)} = a^{[2] (i)} = \sigma(z^{ [2] (i)})\tag{4}$$
$$y^{(i)}_{prediction} = \begin{cases} 1 & \mbox{if } a^{[2](i)} > 0.5 \\ 0 & \mbox{otherwise } \end{cases}\tag{5}$$

Given the predictions on all the examples, you can also compute the cost $J$ as follows:
$$J = - \frac{1}{m} \sum\limits_{i = 0}^{m} \large\left(\small y^{(i)}\log\left(a^{[2] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[2] (i)}\right) \large \right) \small \tag{6}$$

Reminder: The general methodology to build a Neural Network is to (a rough F# outline of this loop follows the list):

1. Define the neural network structure (# of input units, # of hidden units, etc.).
2. Initialize the model’s parameters
3. Loop:
– Implement forward propagation
– Compute loss
– Implement backward propagation to get the gradients
– Update parameters (gradient descent)
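
The sketch below is only a roadmap of that loop, written with the names of the functions we are about to implement; the real, complete version is the nn_model function of section 4.4.

//rough outline only: every function used here is implemented in the sections that follow
let train_outline (X:Matrix<float>) (Y:Vector<float>) (n_h:int) (num_iterations:int) (learning_rate:float) =
    let sizes = layer_sizes X Y n_h                                        //1. define the network structure
    let mutable parameters = initialize_parameters sizes.[0] n_h sizes.[2] //2. initialize the parameters
    for _ in 1 .. num_iterations do                                        //3. the loop
        let A2, cache = forward_propagation X parameters                   //   forward propagation
        let cost = compute_cost A2 Y                                       //   cross-entropy loss (for monitoring)
        let grads = backward_propagation parameters cache X Y              //   backward propagation
        parameters <- update_parameters parameters grads learning_rate     //   gradient descent update
    parameters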

4.1 – Defining the neural network structure

Our neural network will need to be set up using three sizes:

  • input layer size
  • size of our hidden layer (how many neurons)
  • size of our output layer (how many classes)
let layer_sizes (x:Matrix<float>) (y:Vector<float>) (hidden_layer_size:int) =
    let n_x =  shape (MatrixVectorDU.Matrix x) |> fst // size of input layer
    let n_h = hidden_layer_size
    let n_y = shape (MatrixVectorDU.Vector y) |> fst// size of output layer
    [| n_x; n_h; n_y |]
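
As a quick check (not in the original notebook), calling this on our dataset with a 4-unit hidden layer returns the sizes we expect:

let sizes_check = layer_sizes X Y 4
printfn "%A" sizes_check //[|2; 4; 1|]: 2 input features, 4 hidden units, 1 output unit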

4.2 – Initialize the model’s parameters

We will now implement the function initialize_parameters.

Instructions:

  • We will initialize the weights matrices with random values.
  • We will initialize the bias vectors as zeros.
type public NnParameter = {
    W1: Matrix<float>;
    b1: Vector<float>;
    W2: Matrix<float>;
    b2: Vector<float>;
    }

let initialize_parameters n_x n_h n_y =
    let W1 =  Matrix.Build.Random(n_h, n_x) * 0.01
    let b1 = createVector n_h 0.0 |> DenseVector.ofSeq
   
    let W2 = Matrix.Build.Random(n_y, n_h) * 0.01
    let b2 = createVector n_y 0.0 |> DenseVector.ofSeq
   
    assert (shape (MatrixVectorDU.Matrix W1) = (n_h, n_x)) //matrix of size (hidden layer size, input layer size), as each neuron in the hidden layer computes weights for the input x
    assert (shape (MatrixVectorDU.Vector b1) = (1, n_h)) //vector of the same size as the hidden layer, as each hidden neuron computes its own b
    assert (shape (MatrixVectorDU.Matrix W2) = (n_y, n_h)) //matrix of size (output layer size, hidden layer size), as each output neuron computes weights for its input from the previous (hidden) layer
    assert (shape (MatrixVectorDU.Vector b2) = (1, n_y)) //vector of the same size as the output layer, as each output neuron computes its own b
   
    {
        W1 = W1;
        b1 = b1;
        W2 = W2;
        b2 = b2;
    }

4.3 – The Loop

We will now implement the forward_propagation function.

Instructions:

  • The steps you have to implement are:
    1. Retrieve each parameter from the object “parameters” (which at first is the output of initialize_parameters).
    2. Compute \(Z^{[1]}, A^{[1]}, Z^{[2]}\) and \(A^{[2]}\) (the vector of all our predictions on all the examples in the training set).
  • Values needed in the backpropagation are stored in “cache”. The cache will be given as an input to the backpropagation function.

In Python, NumPy makes this step easy: np.dot(W1, X) multiplies the (4, 2) weight matrix with the (2, 400) input matrix, and broadcasting then adds the bias vector b1 to every column of the (4, 400) result. Math.NET does not broadcast the bias addition this way, so we have to build Z1 manually, one output row at a time, adding the corresponding bias entry as we go.
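
To make the shapes concrete, here is a toy illustration (arbitrary values, not part of the notebook) of the broadcast we emulate: the product W·X is built first, then each bias entry is added to its whole row.

let toy_W = matrix [ [ 1.0; 0.0 ]; [ 0.0; 1.0 ] ]           //(2, 2) weights
let toy_X = matrix [ [ 1.0; 2.0; 3.0 ]; [ 4.0; 5.0; 6.0 ] ] //(2, 3) inputs, one column per example
let toy_b = vector [ 10.0; 20.0 ]                           //one bias per output row
let toy_WX = toy_W * toy_X                                  //(2, 3) product
//NumPy broadcasting would do the bias addition implicitly when writing np.dot(W, X) + b
let toy_Z = DenseMatrix.ofRowSeq (seq { for i in 0 .. toy_WX.RowCount - 1 -> toy_WX.Row(i) + toy_b.[i] })

With that intuition in mind, here is the helper and the forward propagation itself.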

type public ForwardPropagationResult = {
    Z1: Matrix<float>;
    A1: Matrix<float>;
    Z2: Matrix<float>;
    A2: Vector<float>;
    }
   
let multiply_matrix_by_matrix_of_another_size_and_add_b (X:Matrix<float>) (weights:Matrix<float>) (b:Vector<float>) =
    let weight_shape = shape (MatrixVectorDU.Matrix weights)
    let mutable weights_vectors = Seq.empty<Vector<float>>
    for i in 0..(snd weight_shape) - 1 do
        let ax_plus_b = X.LeftMultiply (weights.Column(i) |> DenseVector.ofSeq) |> Vector.map (fun x -> x + b.[i])
        weights_vectors <- Seq.append weights_vectors [ax_plus_b]
    weights_vectors |> DenseMatrix.ofRowSeq
   
let forward_propagation (X:Matrix<float>) (parameters:NnParameter) =

    let W1 = parameters.W1
    let b1 = parameters.b1
    let W2 = parameters.W2
    let b2 = parameters.b2
   
    //Implement Forward Propagation
    let Z1 = multiply_matrix_by_matrix_of_another_size_and_add_b X (W1.Transpose()) b1
    let A1 = Z1 |> Matrix.map (fun x -> tanh x) //hidden layer uses tanh

    let Z2 = multiply_matrix_by_matrix_of_another_size_and_add_b A1 (W2.Transpose()) b2 //input is output from previous Layer, A1
    let A2 = sigmoid (Z2.Row(0) |> DenseVector.ofSeq) //output layer uses sigmoid

    let shape_X = shape (MatrixVectorDU.Matrix X)
    assert ( shape (MatrixVectorDU.Vector A2) = (1, snd shape_X) ) //one prediction per training example
   
    let cache = {
                    Z1 = Z1;
                    A1 = A1;
                    Z2 = Z2;
                    A2 = A2;
                 }
   
    A2, cache
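
As a quick sanity check on shapes (not in the original notebook; the initial weights are random, so the values themselves are not meaningful):

let check_parameters = initialize_parameters 2 4 1
let check_A2, _ = forward_propagation X check_parameters
printfn "shape of A2 is %s" (string (shape (MatrixVectorDU.Vector check_A2))) //(1, 400): one prediction per example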

Now that you have computed \(A^{[2]}\) (in the variable “A2”), which contains \(a^{[2](i)}\) for every example, you can compute the cost function as follows:

$$J = - \frac{1}{m} \sum\limits_{i = 0}^{m} \large{(} \small y^{(i)}\log\left(a^{[2] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[2] (i)}\right) \large{)} \small\tag{13}$$

let compute_cost (A2:Vector<float>) (Y:Vector<float>) =
    //number of examples
    let m = shape (MatrixVectorDU.Vector Y) |> snd |> float
    //Compute the cross-entropy cost
    let logprobs =  A2.PointwiseLog().PointwiseMultiply(Y) + A2.Map(fun a -> 1.0 - a).PointwiseLog().PointwiseMultiply(Y.Map(fun y -> 1.0 - y))
    - logprobs.Sum() |> float |> (fun x -> x / m)
let compute_result = compute_cost (vector[ 0.5002307; 0.49985831;  0.50023963]) (vector[ 1.0; 0.0; 0.0])
printfn "cost = %s" (string compute_result)

cost = 0.693058761039465

Expected Output:

**cost** 0.693058761…

We can now implement the function backward_propagation.

Instructions:
Backpropagation is usually the hardest (most mathematical) part of deep learning. These are the equations we’ll be implementing to find dW1, db1, dW2 and db2:

\(\frac{\partial \mathcal{J} }{ \partial z_{2}^{(i)} } = \frac{1}{m} (a^{[2](i)} - y^{(i)})\)

\(\frac{\partial \mathcal{J} }{ \partial W_2 } = \frac{\partial \mathcal{J} }{ \partial z_{2}^{(i)} } a^{[1] (i) T} \)

\(\frac{\partial \mathcal{J} }{ \partial b_2 } = \sum_i{\frac{\partial \mathcal{J} }{ \partial z_{2}^{(i)}}}\)

\(\frac{\partial \mathcal{J} }{ \partial z_{1}^{(i)} } = W_2^T \frac{\partial \mathcal{J} }{ \partial z_{2}^{(i)} } * ( 1 - a^{[1] (i) 2}) \)

\(\frac{\partial \mathcal{J} }{ \partial W_1 } = \frac{\partial \mathcal{J} }{ \partial z_{1}^{(i)} } X^T \)

\(\frac{\partial \mathcal{J} _i }{ \partial b_1 } = \sum_i{\frac{\partial \mathcal{J} }{ \partial z_{1}^{(i)}}}\)

  • Note that \(*\) denotes elementwise multiplication.
  • The notation you will use is common in deep learning coding:
    • dW1 = \(\frac{\partial \mathcal{J} }{ \partial W_1 }\)
    • db1 = \(\frac{\partial \mathcal{J} }{ \partial b_1 }\)
    • dW2 = \(\frac{\partial \mathcal{J} }{ \partial W_2 }\)
    • db2 = \(\frac{\partial \mathcal{J} }{ \partial b_2 }\)
type public BackwardPropagationResult = {
    dW2: Vector<float>;
    db2: float;
    dW1: Matrix<float>;
    db1: Vector<float>;
    }
   
let multiply_matrix_by_vectorx_of_different_row_count (X:Matrix<float>) (v:Vector<float>) =
    let matrix_shape = shape (MatrixVectorDU.Matrix X)
    let mutable column_vectors = Seq.empty<Vector<float>>
    for i in 0..(snd matrix_shape) - 1 do
        let column = X.Column(i)
        let mutable new_values = Seq.empty<float>
        for row_value in column do
            for vector_value in v do
                let multiply_result = row_value * vector_value
                new_values  <- Seq.append new_values [multiply_result]
        let new_vector = new_values |> DenseVector.ofSeq
        column_vectors <- Seq.append column_vectors [new_vector]
    column_vectors |> DenseMatrix.ofRowSeq

let backward_propagation (parameters:NnParameter) (cache:ForwardPropagationResult) (X:Matrix<float>) (Y:Vector<float>) =

    let m = shape (MatrixVectorDU.Matrix X) |> snd |> float
   
    //First, retrieve W1 and W2 from the dictionary "parameters".
    let W1 = parameters.W1
    let W2 = parameters.W2
    let A1 = cache.A1
    let A2 = cache.A2
   
    //Backward propagation: calculate dW1, db1, dW2, db2.
    let dZ2 = A2 - Y
    let dW2 = 1.0 / m * (dZ2 * A1.Transpose())
    let db2 = 1.0 / m * dZ2.Sum()    

    let dZ1 = (multiply_matrix_by_vectorx_of_different_row_count W2 dZ2).PointwiseMultiply((1.0 - (A1 |> Matrix.map (fun x -> x*x)) )) //(W2^T dZ2) elementwise multiplied by (1 - A1^2), the derivative of tanh
    let dW1 = 1.0 / m * (dZ1 * X.Transpose())
    let db1 = 1.0 / m * dZ1.RowSums()
   
    {
        dW1 = dW1;
        db1 = db1;
        dW2 = dW2;
        db2 = db2;
    }
let test_params = {
                    W1 = matrix[
                                    [-0.00416758; -0.00056267];
                                    [-0.02136196;  0.01640271];
                                    [-0.01793436; -0.00841747];
                                    [ 0.00502881; -0.01245288];
                                ];
                    W2 = matrix [
                                    [-0.01057952; -0.00909008;  0.00551454;  0.02292208]
                                ];
                    b1 = vector [ 0.0 ; 0.0; 0.0; 0.0];
                    b2 = vector [ 0.0];
                }
       
       
let test_cache = {
                    A1 = matrix[
                                    [-0.00616578;  0.0020626 ;  0.00349619];
                                    [-0.05225116;  0.02725659; -0.02646251];
                                    [-0.02009721;  0.0036869 ;  0.02883756];
                                    [ 0.02152675; -0.01385234;  0.02599885];
                                ];
                   A2 = vector [ 0.5002307;  0.49985831;  0.50023963];
                   Z1 = matrix [
                                   [-0.00616586;  0.0020626;  0.0034962 ];
                                   [-0.05229879;  0.02726335; -0.02646869];
                                   [-0.02009991;  0.00368692;  0.02884556];
                                   [ 0.02153007; -0.01385322;  0.02600471];
                                ];
                   Z2 = matrix [ [ 0.00092281; -0.00056678;  0.00095853] ];
                }

let test_X = matrix [
                        [ 1.62434536; -0.61175641; -0.52817175 ];
                        [ -1.07296862;  0.86540763; -2.3015387 ];
                    ]

let test_Y = vector [ 1.0; 0.0; 1.0]

let prop_result = backward_propagation test_params test_cache test_X test_Y
prop_result

{dW2 = seq [0.0007884060323; 0.01765429082; -0.0008416578262; -0.01022527031];
db2 = -0.16655712;
dW1 =
DenseMatrix 4×2-Double
0.00301023 -0.00747267
0.00257968 -0.00641288
-0.00156892 0.003893
-0.00652037 0.0161824
;
db1 = seq [0.001762013337; 0.001509948005; -0.0009173633898; -0.003814217874];}

Expected output:

**dW1** [[ 0.00301023 -0.00747267]
[ 0.00257968 -0.00641288]
[-0.00156892 0.003893 ]
[-0.00652037 0.01618243]]
**db1** [[ 0.00176201]
[ 0.00150995]
[-0.00091736]
[-0.00381422]]
**dW2** [[ 0.00078841 0.01765429 -0.00084166 -0.01022527]]
**db2** [[-0.16655712]]

We will now implement the update rule using gradient descent: we use the gradients (dW1, db1, dW2, db2) to update the parameters (W1, b1, W2, b2).

General gradient descent rule: $ \theta = \theta - \alpha \frac{\partial J }{ \partial \theta }$ where $\alpha$ is the learning rate and $\theta$ represents a parameter.
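
For example, with the learning rate \(\alpha = 1.2\) that we will use below, a parameter currently equal to \(0.5\) whose gradient is \(0.1\) is updated to \(0.5 - 1.2 \times 0.1 = 0.38\).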

Illustration: The gradient descent algorithm with a good learning rate (converging) and a bad learning rate (diverging).

let update_parameters (parameters:NnParameter) (grads:BackwardPropagationResult) (learning_rate:float) =

    let mutable W1 = parameters.W1
    let mutable b1 = parameters.b1
    let mutable W2 = parameters.W2
    let mutable b2 = parameters.b2
   
    let dW1 = grads.dW1
    let db1 = grads.db1
    let dW2 = grads.dW2
    let db2 = grads.db2
   
    //Update rule for each parameter
    W1 <- W1 - dW1 * learning_rate
    b1 <- b1 - db1 * learning_rate
    W2 <- W2 - (dW2.ToRowMatrix()) * learning_rate
    b2 <- b2 - db2 * learning_rate
     
    {
        W1 = W1;
        b1 = b1;
        W2 = W2;
        b2 = b2;
    }
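
As a quick usage example (not in the original notebook), we can apply a single update step to the test parameters and gradients defined earlier; with a learning rate of 1.2, each parameter moves a small step against its gradient:

let updated_test_params = update_parameters test_params prop_result 1.2
printfn "updated W2 = %A" updated_test_params.W2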

4.4 – Integrate parts 4.1, 4.2 and 4.3 in nn_model

We can now put all these functions together and build our neural network model in a nn_model function.

let nn_model(X:Matrix<float>) (Y:Vector<float>) (n_h:int) (num_iterations:int) (learning_rate:float) (print_cost:bool) =

    let layer_sizes = layer_sizes X Y n_h
   
    let n_x = layer_sizes.[0]
    let n_y = layer_sizes.[2]
   
    //Initialize parameters, then retrieve W1, b1, W2, b2. Inputs: "n_x, n_h, n_y". Outputs = "W1, b1, W2, b2, parameters".
    let mutable parameters = initialize_parameters n_x n_h n_y
    let W1 = parameters.W1
    let b1 = parameters.b1
    let W2 = parameters.W2
    let b2 = parameters.b2
   
    //Loop (gradient descent)
    for i in 0 .. num_iterations do
         
        //Forward propagation. Inputs: "X, parameters". Outputs: "A2, cache".
        let A2, cache = forward_propagation X parameters
       
        //Cost function. Inputs: "A2, Y". Outputs: "cost".
        let cost = compute_cost A2 Y
 
        //Backpropagation. Inputs: "parameters, cache, X, Y". Outputs: "grads".
        let grads = backward_propagation parameters cache X Y
 
        //Gradient descent parameter update. Inputs: "parameters, grads". Outputs: "parameters".
        parameters <- update_parameters parameters grads learning_rate
       
        //Print the cost every 1000 iterations
        if print_cost && i % 1000 = 0 then (printfn "Cost after iteration %i: %f" i cost)

    parameters

4.5 – Predictions

We will use our model to make predictions by building the predict function.
To do so, we simply run forward propagation and threshold the resulting activations.

Reminder: predictions = $y_{prediction} = \mathbb{1}\{activation > 0.5\} = \begin{cases} 1 & \text{if } activation > 0.5 \\ 0 & \text{otherwise} \end{cases}$

let predict (X:Matrix<float>) (parameters:NnParameter) =
    let A2, cache = forward_propagation X parameters
    A2 |> Vector.map(fun x -> if x > 0.5 then 1.0 else 0.0)

It is time to run the model and see how it performs on a planar dataset.

//Build a model with a n_h-dimensional hidden layer
let nn_learned_parameters = nn_model X Y 4 10000 1.2 true

Cost after iteration 0: 0.693134
Cost after iteration 1000: 0.281911
Cost after iteration 2000: 0.269387
Cost after iteration 3000: 0.261995
Cost after iteration 4000: 0.237597
Cost after iteration 5000: 0.225616
Cost after iteration 6000: 0.221424
Cost after iteration 7000: 0.218769
Cost after iteration 8000: 0.216774
Cost after iteration 9000: 0.215168
Cost after iteration 10000: 0.217994

As we did with the logistic regression model, we will plot the decision boundary for our neural network model.

let nn_extract_heatmap_serie startx starty endx endy steps nnParameter =
    seq {
        let x = generate_seq_by startx endx steps
        let y = generate_seq_by starty endy steps
       
        for value in x do
            let pair_as_matrix = generate_key_value_list value y |> DenseMatrix.ofColumnSeq
            let predictions = predict pair_as_matrix nnParameter
            yield predictions |> Vector.toList
        }
       
let logisctic_heatmap = nn_extract_heatmap_serie -4.0 -4.0 4.0 4.0 0.01 nn_learned_parameters |> Seq.toList
let grid_values = generate_seq_by -4.0 4.0 0.01 |> Seq.toList

let heatmap_trace = Heatmap (
                        z = logisctic_heatmap,
                        x = grid_values,
                        y = grid_values,
                        showlegend = false,
                        showscale = false
                        ) :> Trace
                       
[heatmap_trace; scatter_trace_1; scatter_trace_2]                    
    |> Chart.Plot
    |> Chart.WithTitle "Decision Boundary for hidden layer size 4"
    |> Chart.WithXTitle "X1"
    |> Chart.WithYTitle "X2"
    |> Chart.WithLegend false
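
We can also measure the training accuracy of the network, as we did for logistic regression, with a small helper (hypothetical, not part of the original post) that reuses predict and mirrors the accuracy computation of our LogisticRegression module:

let nn_accuracy (predictions:Vector<float>) (labels:Vector<float>) =
    //proportion of matching labels, expressed as a percentage
    100.0 * (1.0 - (predictions - labels |> Vector.map (fun x -> abs x) |> Vector.toArray |> Array.average))

let nn_predictions = predict X nn_learned_parameters
printfn "train accuracy: %f" (nn_accuracy nn_predictions Y)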

4.6 – Tuning hidden layer size

Our network with 4 hidden neurons performs quite well on this dataset. Now let’s dig a bit further and see how the network behaves when we change the number of neurons in the hidden layer.

let build_models_different_hidden_layer_size sizes =
    let mutable i = 1
    seq {
        for size in sizes do
            let nn_lp = nn_model X Y size 5000 1.2 true
            let logisctic_heatmap = nn_extract_heatmap_serie -4.0 -4.0 4.0 4.0 0.01 nn_lp |> Seq.toList
            let grid_values = generate_seq_by -4.0 4.0 0.01 |> Seq.toList
           
            yield Heatmap  (
                                z = logisctic_heatmap,
                                x = grid_values,
                                y = grid_values,
                                showlegend = false,
                                showscale = false,
                                xaxis = "x" + (string i),
                                yaxis = "y" + (string i)
                            ) :> Trace
            i <- i + 1
        }

let models_traces = build_models_different_hidden_layer_size [| 1; 2; 3; 4; 5; 20; 50 |] |> Seq.toList

let layout = Layout(
                        xaxis = Xaxis ( domain = [ 0.0; 0.45], title="X1" ),
                        yaxis = Yaxis( anchor = "x2", title="X2" ),
                        xaxis2 = Xaxis ( domain = [0.55; 1.0], title="X1" ),
                        yaxis2 = Yaxis( anchor = "x2", title="X2" )
                    )

models_traces                    
        |> Chart.Plot
        |> Chart.WithLayout layout
        |> Chart.WithLegend false

Interpretation:

  • The larger models (with more hidden units) are able to fit the training set better, until eventually the largest models overfit the data.
  • The best hidden layer size seems to be around n_h = 5.
  • We will also learn later about regularization, which lets you use very large models (such as n_h = 50) without much overfitting.

You’ve learnt to:

  • Build a complete neural network with a hidden layer
  • Make good use of a non-linear activation unit
  • Implement forward propagation and backpropagation, and train a neural network
  • See the impact of varying the hidden layer size, including overfitting
