Workshop #162

Merged
merged 6 commits into from
Jan 12, 2018
96 changes: 96 additions & 0 deletions workshop/lec1/analytical_solution.py
@@ -0,0 +1,96 @@
import numpy as np
import matplotlib.pyplot as plt

class regressor(object):
"""
This is a sample class for lecture 1.

Args:
data: Is a tuple, ``(x,y)``
``x`` is a one or two dimensional ndarray in which axis 0 indexes samples and axis 1
indexes features. If the array has only one dimension, the data is 1D.
``y`` is a 1D ndarray; it has the same length as axis 0 of ``x``.

"""
def __init__(self, data):
self.x, self.y = data
# Training happens here: solve the least-squares problem analytically.
# A column of ones is prepended to x so that the bias is learned along with the weights.
x = np.concatenate((np.ones((self.x.shape[0],1)), self.x), axis = 1)
w = np.dot(np.linalg.pinv(np.dot(x.T,x)), np.dot(x.T,self.y))
self.w = w[1:]
self.b = w[0]

def get_params (self):
"""
Method that should return the model parameters.

Returns:
tuple of numpy.ndarray: (w, b).

Notes:
The parameters returned are those computed by the analytical solution in ``__init__``.

"""
return (self.w, self.b)

def get_predictions (self, x):
"""
Method that returns the model outputs for unseen data.

Args:
x: array similar to ``x`` in ``data``. Might be of different size.

Returns:
numpy.ndarray: ``y`` which is a 1D array of predictions of the same length as axis 0 of
``x``
"""
predictions = np.add(np.dot(x, self.w), self.b)
return predictions

def plot(self, data = None):
""" Method will plot the line on an existing pyplot.
If data is provided, it will plot with the data.

Args:
data: tuple of `(x,y)`.
"""
if data is not None:
x, y = data
plt.plot(x[:], y, 'bo')
plt.axis('equal')
plt.title('Analytical Least Squares Linear Regression on the Amazon Employee Compensation (Linear) Dataset')
plt.xlabel('Years of experience of the employee.')
plt.ylabel('Compensation in $100,000')
grid = np.asarray([0, 1])[:,np.newaxis]
predictions = self.get_predictions(grid)
plt.plot(grid, predictions, 'r')
plt.show()

class ridge_regressor(regressor):
"""
This is a sample class for lecture 1.

Args:
data: Is a tuple, ``(x,y)``
``x`` is a one or two dimensional ndarray in which axis 0 indexes samples and axis 1
indexes features. If the array has only one dimension, the data is 1D.
``y`` is a 1D ndarray; it has the same length as axis 0 of ``x``.
alpha: Coefficient of the L2 regularizer.

"""
def __init__(self, data, alpha = 0.0001):
self.x, self.y = data
# Training happens here: solve the L2-regularized least-squares problem analytically.
# A column of ones is prepended to x so that the bias is learned along with the weights.
x = np.concatenate((np.ones((self.x.shape[0],1)), self.x), axis = 1)
w = np.dot(np.linalg.pinv(np.dot(x.T,x) + alpha*np.eye(x.shape[1])), np.dot(x.T,self.y))
self.w = w[1:]
self.b = w[0]

if __name__ == '__main__':
pass
77 changes: 77 additions & 0 deletions workshop/lec1/dataset.py
@@ -0,0 +1,77 @@
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(42)

class dataset_generator(object):
"""
Class that creates a random dataset to be modelled by a linear regressor.

Args:
dimensions: number of dimensions of dataset (optional, default randomly 15-30)
mu: mean of the Gaussian noise added to the targets (optional, default 0)
sigma: standard deviation of the Gaussian noise (optional, default 0.07)
"""
def __init__(self, **kwargs):
low = 15
high = 30
if 'dimensions' in kwargs.keys():
self.dimensions = kwargs['dimensions']
else:
self.dimensions = np.random.randint(low = low,high = high)
if 'mu' in kwargs.keys():
self.mu = kwargs['mu']
else:
self.mu = 0
if 'sigma' in kwargs.keys():
self.sigma = kwargs['sigma']
else:
self.sigma = 0.07

self.w = np.random.rand(self.dimensions,1)
self.b = np.random.rand(1)

def query_data(self, **kwargs):
"""
Once initialized, this method will create more data.

Args:
samples: number of samples of data needed (optional, default randomly 1000 - 5000)
Returns:
tuple: data a tuple, ``(x,y)``
``x`` is a two dimensional ndarray in which axis 0 indexes samples and axis 1
indexes features.
``y`` is an ndarray of shape ``(samples, 1)`` with one target per row of ``x``.
"""
if 'samples' in kwargs.keys():
samples = kwargs['samples']
else:
samples = np.random.randint(low = 1000, high = 5000)

x = np.random.uniform(size = (samples, self.dimensions))
y = np.dot(x, self.w) + np.random.normal(self.mu, self.sigma, (samples,1)) + self.b

return (x,y)

def plot(self, x, y):
"""
This method will plot the data as created by this dataset generator.
Args:
x: as produced by the ``query_data`` method's first element.
y: as produced by the ``query_data`` method's second element.
"""
plt.plot(x[:], y, 'bo')
plt.axis('equal')
plt.title('Amazon Employee Compensation (Linear) Dataset.')
plt.xlabel('Years of experience of the employee.')
plt.ylabel('Compensation in $100,000.')
plt.show()

def demo (self, samples = 50):
"""
This is a demonstration method that will plot a version of a random dataset on the screen.

Args:
samples: number of samples of data needed (optional, default 50)
"""
x, y = self.query_data(samples = samples)
self.plot(x, y)
14 changes: 14 additions & 0 deletions workshop/lec1/errors.py
@@ -0,0 +1,14 @@
import numpy as np

def rmse ( a, b ):
"""
This function computes the root mean squared error (RMSE) between ``a`` and ``b``

Args:
a: first input ndarray
b: second input ndarray

Returns:
numpy float: rmse error
"""
return np.sqrt(np.mean((a - b) ** 2))
Binary file added workshop/lec1/figures/regularization.png
205 changes: 205 additions & 0 deletions workshop/lec1/lec1.ipynb
@@ -0,0 +1,205 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Lecture 1: The computational real neuron. \n",
"\n",
"Welcome to the SageMaker workshop. \n",
"\n",
"## Supervised Learning.\n",
"\n",
"**Supervised learning is the task of arriving at a mathematical mapping function from the co-variate space to the variate space using a labeled training dataset.** The training dataset is of a set of co-variate - variate sample mapping. In supervised learning, each example is a pair consisting of an input object (typically a vector) and a desired output value (also called the supervisory signal). Colloquially, various names are used for the co-variates and variates, the most common ones being 'features' and 'lables'.\n",
"\n",
"Let us create a relatable and lower-dimensional dataset to study supervised learning. Assume that you are a human resource manager at Amazon and that you are planning to make strategic human resource expansions in your department. While interviewing candidates, you would like to know antecedently how much that candidate’s pay scale is likely to be. In today’s market where data scientists are in strong demand, most candidates have a free-market value they are predisposed to expect. As a data scientist yourself, and following with Amazon's tradition of relenetlessly relying on data, you could use machine learning to model a future candidate’s potential compensation. Using this knowledge, you can negotiate during the interview. \n",
"\n",
"Let us use the compensation of all those who are already employed in your org in estimating a future employee’s pay. Say your org has $n+m$ employees. If you create a dataset of your current employees, you may come up with something that looks like the following (considering for now only the first nemployees as the training data):\n",
"$$ D = \\begin{bmatrix}\n",
" \\bf{x_1} & y_1 \\\\ \n",
" \\bf{x_2} & y_2 \\\\\n",
" \\vdots & \\vdots \\\\\n",
" \\bf{x_n} & y_n \\end{bmatrix},$$\n",
"where, $\\bf{x_i} \\in \\mathbb{R}^d$ is a d-dimensional (vector) sample where each sample represents an existing employee and each dimesnion of this ample corresponds to an attribute of the employee that is related to their compensation and $y_i \\in \\mathbb{R}^1$ is the salary of the respective employee. \n",
"\n",
"In this dataset, **to *learn* is to establish a mapping between the features and the labels.** To model the compensation of the employees, consider for now that, $x_i \\in \\mathbb{R}^1$, is a one-dimensional feature, perhaps the number of years of experience a candidate has in the field. The provided code has a data simulator that will generate some syntehtic data to mimic this scenario. The data might look like something like what is generated by the code-block below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from dataset import dataset_generator\n",
"# Initialize a dataset creator\n",
"dataset = dataset_generator(dimensions = 1)\n",
"dataset.demo()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Formally, Supervised learning is the process of establishing through a model, the relationship or mapping between $(\\bf{x} ,y)$ using $(\\bf{x_i},y_i) \\in D \\forall i \\in [i,n]$, such that once established, given a new sample $x_{n+j} \\notin D$ and $j<m$, the model estimates $y_{n+j}$. Informally, we want to learn a model using the training part of the dataset, such that we should be able to predict the associated label for samples from the remaining. The dataset $D$ is called the training dataset and the set of samples $x_i, i \\in (n, n + m]$ is called the generalization dataset. The generalization dataset is typically the real world. If we have the knowledge of the actual targets of the samples in the generalization set, we call it the testing set, as we can use it to test the quality of our model before deploying it in the real world.\n",
"\n",
"Models are usually of the form $\\hat{y} = g(\\bf{X},w)$ where, $w$ are the parameters or weights of the model that transform $X$ to $\\hat{y}$. In short, a model is a functional form that was predetermined that depends on some to-be determined parameters whose values are to be learnt using the data.\n",
"\n",
"## Least Squares Linear Regression.\n",
"\n",
"Let us posit that the experience of the candidates and their compensation are **linearly related**. What this means is that we are making a decision that the relationship between the candidates’ experience and the salaries is captured by a straight line. With this assumption, we have are limiting the architecture of our problem to linear models and converted our problem into a linear regression problem. Essentially, if our data is $x \\in \\mathbb{R}^1$, then our prediction is, \n",
"$$ \\hat{y} = w_1x + b.$$\n",
"If $\\bf{x} \\in \\mathbb{R}^d $, then \n",
"$$ \\hat{y} = \\sum_{i=1}^d w_ix^i + b.$$\n",
"\n",
"To know how good our predictions are we need some metric to measure our errors. Consider the root-mean-squared error or the RMSE,\n",
"$$ e_i(\\bf{w}) = \\vert \\vert \\hat{y_i} - y_i \\vert \\vert_2, $$\n",
"which, will tell us **how *far* away our prediction $\\hat{y_i}$ is from the actual value $y_i, \\forall i \\in [0,n]$ in the Euclidean sense**. For our entire dataset, we can have a cumulative error defined as,\n",
"$$e(\\bf{w}) = \\sum_{i=1}^n \\vert \\vert y_i - \\hat{y_i} \\vert \\vert_2,$$\n",
"or,\n",
"$$ e(\\bf{w}) = \\sum_{i=1}^n \\vert \\vert y_i - W^TX + b \\vert \\vert_2.$$\n",
"\n",
"This error is often referred to as the objective. This is what we want to **minimize**. We want those parameters $w$, that will get us to be as low as possible $e(w)$. Formally, we want,\n",
"$$ \\hat{w} = \\arg\\min_w e(w). $$\n",
"We can derive a solution for this optimization problem analytically.\n",
"$$ e(w) = \\frac{1}{2}(y-w^TX)^T(y-w^TX),$$\n",
"$$\\frac{\\partial e}{\\partial w} = -X^Tt + X^TXw,$$\n",
"equating this to zero to obtain minima we get,\n",
"$$X^TXw = X^Ty,$$\n",
"$$\\hat{w} = (X^TX)^{-1}X^Ty.$$\n",
"$\\hat{w}$ is will give us the minimum most error possible and this solution is called the analytical solution.\n",
"\n",
"### Implementing the analytical solution.\n",
"\n",
"Let us run our dataset through this analytical solution and see if it will work. The regressor base class is defined in [analytical_solution](analytical_solution.py) file. The core of this code is [line 21](analytical_solution.py#L21), which is the following.\n",
"```python \n",
"w = np.dot(np.linalg.pinv(np.dot(x.T,x)), np.dot(x.T,self.y))\n",
"```\n",
"As can be seen, it is a direct implementation of the analytical solution.\n",
"\n",
"To see this in action let us create a training dataset of 40 samples from our generator and use our regressor to estimate the analytical $w$. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from analytical_solution import regressor\n",
"data_train = dataset.query_data(samples = 40) # Create a training dataset. \n",
"r = regressor(data_train) # This call should return a regressor object that is fully trained.\n",
"reg_params = r.get_params() # This call should return parameters of the model that are \n",
" # fully trained."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let us see how well we are doing. Let us use the same dataset distribution to query 40 additional samples for testing. We can use this testing dataset to make our predictions for the linear regressor. Once we have the predictions, we can use the RMSE to check how well we are doing."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from errors import rmse\n",
"data_test = dataset.query_data(samples = 40) # Create a random testing dataset.\n",
"predictions = r.get_predictions(data_test[0]) # This call should return predictions.\n",
"print (\"Rmse error of predictions = \" + str(rmse(data_test[1], predictions)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The model is a line. We should be able to visualize this line as shown below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"r.plot(data_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can clearly see here that our simple model works pretty fine. Although for this simple linear model an analytical solution does exist, we find that for more complex problem structures we have to rely on some optimization procedures that are described in the later lectures.\n",
"\n",
"## Ridge Regression.\n",
"\n",
"We used a ``numpy.linalg.pinv`` to solve this problem. We did this because **not always is $x^Tx$ invertible**. What can we do in our analytical solution to make this invertible? One thing that can be done to make this solution more stable is to ensure that the diagonal elements of $w^Tw$ behave nicely. Consider the following analytical solution for $\\hat{w}$,\n",
"$$\\hat{w} = (X^TX + \\alpha_2I)^{-1}X^Ty.$$\n",
"In this solution, you can be quite sure that this will give a reasonablly good solution. What is this a solution for? \n",
"Consider the error function,\n",
"$$e(w)=(y-w^Tx)^T(y-w^Tx) + \\alpha_2wTw.$$\n",
"Now,\n",
"$$\\frac{\\partial e}{\\partial w} = \\frac{\\partial e}{\\partial w} ( w^Tx^Txw - 2y^Txw + y^Ty + \\alpha_2w^Tw),$$\n",
"$$ = 2x^Txw - 2x^Ty + 2\\alpha_2I,$$\n",
"$$ = 2(x^Tx + \\alpha_2I)w - 2X^Ty,$$\n",
"which, when equated to zero to obtain the minima we get,\n",
"$$(x^Tx + \\alpha_2I)w = X^Ty,$$\n",
"$$\\hat{w} = (X^TX + \\alpha_2I)^{-1}X^Ty.$$"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from analytical_solution import ridge_regressor\n",
"data_train = dataset.query_data(samples = 40) # Create a training dataset. \n",
"r = ridge_regressor(data_train, alpha = 0.0001) # This call should return a regressor object that is fully trained.\n",
"ridge_params = r.get_params() # This call should return parameters of the model that are \n",
" # fully trained.\n",
"data_test = dataset.query_data(samples = 40) # Create a random testing dataset.\n",
"predictions = r.get_predictions(data_test[0]) # This call should return predictions.\n",
"print (\"Rmse error of predictions = \" + str(rmse(data_test[1], predictions)))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Geometry of the $L_2$ regularizer\n",
"![Geometry of L2 Regularization](figures/regularization.png)\n",
"\n",
"The errors we use above are squared errors. With that in mind, if we drew out the errors in the parameters space, we will get an error function which will be a *bowl*. At $\\alpha_2 = 0$, we will be at the center. There are a lot of reasons why we might not want to prefer that. For instance, Smaller weights imply that we know that our weights are stable. In the future, we will notice that smaller weights will help us with noisy data or even to enforce sparsity. We will also see other types of regularizers in later lectures."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}