3613 lines
84 KiB
Plaintext
3613 lines
84 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# Creating fake data for car_sales (to make it a bit bigger)\n",
|
||
"\n",
|
||
"This notebook manufactures synthetic rows for the car_sales DataFrame so that it is large enough to demonstrate techniques for handling missing data and for converting non-numeric values to numbers."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"\n",
|
||
"car_sales = pd.read_csv('../data/car-sales.csv')"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Make</th>\n",
|
||
" <th>Colour</th>\n",
|
||
" <th>Odometer (KM)</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>150043</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$4,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Red</td>\n",
|
||
" <td>87899</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$5,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>32549</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>$7,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>11179</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>$22,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>213095</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$3,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Green</td>\n",
|
||
" <td>99213</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$4,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>45698</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$7,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>54738</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$7,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>60000</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$6,250.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>31600</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$9,700.00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Make Colour Odometer (KM) Doors Price\n",
|
||
"0 Toyota White 150043 4 $4,000.00\n",
|
||
"1 Honda Red 87899 4 $5,000.00\n",
|
||
"2 Toyota Blue 32549 3 $7,000.00\n",
|
||
"3 BMW Black 11179 5 $22,000.00\n",
|
||
"4 Nissan White 213095 4 $3,500.00\n",
|
||
"5 Toyota Green 99213 4 $4,500.00\n",
|
||
"6 Honda Blue 45698 4 $7,500.00\n",
|
||
"7 Honda Blue 54738 4 $7,000.00\n",
|
||
"8 Toyota White 60000 4 $6,250.00\n",
|
||
"9 Nissan White 31600 4 $9,700.00"
|
||
]
|
||
},
|
||
"execution_count": 2,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"car_sales"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array(['Toyota', 'Honda', 'BMW', 'Nissan'], dtype=object)"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"car_sales.Make.unique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Toyota 4\n",
|
||
"Honda 3\n",
|
||
"Nissan 2\n",
|
||
"BMW 1\n",
|
||
"Name: Make, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"car_sales.Make.value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Create fake \"Make\" data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(393,\n",
|
||
" ['Toyota',\n",
|
||
" 'Toyota',\n",
|
||
" 'Toyota',\n",
|
||
" 'Toyota',\n",
|
||
" 'Toyota',\n",
|
||
" 'Toyota',\n",
|
||
" 'Toyota',\n",
|
||
" 'Toyota',\n",
|
||
" 'Toyota',\n",
|
||
" 'Toyota'])"
|
||
]
|
||
},
|
||
"execution_count": 12,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Create fake \"Make\" data\n",
|
||
"\n",
|
||
"toyota = [\"Toyota\" for i in range(0, 398)]\n",
|
||
"len(toyota), toyota[:10]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {
|
||
"scrolled": true
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(304,\n",
|
||
" ['Honda',\n",
|
||
" 'Honda',\n",
|
||
" 'Honda',\n",
|
||
" 'Honda',\n",
|
||
" 'Honda',\n",
|
||
" 'Honda',\n",
|
||
" 'Honda',\n",
|
||
" 'Honda',\n",
|
||
" 'Honda',\n",
|
||
" 'Honda'])"
|
||
]
|
||
},
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"honda = [\"Honda\" for i in range(0, 304)]\n",
|
||
"len(honda), honda[:10]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(198,\n",
|
||
" ['Nissan',\n",
|
||
" 'Nissan',\n",
|
||
" 'Nissan',\n",
|
||
" 'Nissan',\n",
|
||
" 'Nissan',\n",
|
||
" 'Nissan',\n",
|
||
" 'Nissan',\n",
|
||
" 'Nissan',\n",
|
||
" 'Nissan',\n",
|
||
" 'Nissan'])"
|
||
]
|
||
},
|
||
"execution_count": 14,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"nissan = [\"Nissan\" for i in range(0, 198)]\n",
|
||
"len(nissan), nissan[:10]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(100, ['BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW'])"
|
||
]
|
||
},
|
||
"execution_count": 15,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"bmw = [\"BMW\" for i in range(0, 100)]\n",
|
||
"len(bmw), bmw[:10]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"1000"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"makes = bmw+nissan+toyota+honda\n",
|
||
"len(makes)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Create fake \"Colour\" data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array(['White', 'Red', 'Blue', 'Black', 'Green'], dtype=object)"
|
||
]
|
||
},
|
||
"execution_count": 20,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"car_sales.Colour.unique()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"White 4\n",
|
||
"Blue 3\n",
|
||
"Green 1\n",
|
||
"Black 1\n",
|
||
"Red 1\n",
|
||
"Name: Colour, dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 21,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"car_sales.Colour.value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 29,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(407, ['White', 'White', 'White'])"
|
||
]
|
||
},
|
||
"execution_count": 29,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"white = [\"White\" for i in range(0, 407)]\n",
|
||
"len(white), white[:3]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 30,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(321, ['Blue', 'Blue', 'Blue'])"
|
||
]
|
||
},
|
||
"execution_count": 30,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"blue = [\"Blue\" for i in range(0, 321)]\n",
|
||
"len(blue), blue[:3]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(79, ['Green', 'Green', 'Green'])"
|
||
]
|
||
},
|
||
"execution_count": 31,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"green = [\"Green\" for i in range(0, 79)]\n",
|
||
"len(green), green[:3]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(99, ['Black', 'Black', 'Black'])"
|
||
]
|
||
},
|
||
"execution_count": 32,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"black = [\"Black\" for i in range(0, 99)]\n",
|
||
"len(black), black[:3]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(94, ['Red', 'Red', 'Red'])"
|
||
]
|
||
},
|
||
"execution_count": 35,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"red = [\"Red\" for i in range(0, 94)]\n",
|
||
"len(red), red[:3]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 36,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"1000"
|
||
]
|
||
},
|
||
"execution_count": 36,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"colours = white+blue+green+black+red\n",
|
||
"len(colours)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 62,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(1000,\n",
|
||
" ['White',\n",
|
||
" 'White',\n",
|
||
" 'Blue',\n",
|
||
" 'Blue',\n",
|
||
" 'Blue',\n",
|
||
" 'White',\n",
|
||
" 'Blue',\n",
|
||
" 'Blue',\n",
|
||
" 'Red',\n",
|
||
" 'White'])"
|
||
]
|
||
},
|
||
"execution_count": 62,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"import random\n",
|
||
"colours_shuffled = random.sample(colours, len(colours))\n",
|
||
"len(colours_shuffled), colours_shuffled[:10]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Create fake \"Odometer (KM)\" data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 63,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Make</th>\n",
|
||
" <th>Colour</th>\n",
|
||
" <th>Odometer (KM)</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>150043</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$4,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Red</td>\n",
|
||
" <td>87899</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$5,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>32549</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>$7,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>11179</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>$22,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>213095</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$3,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Green</td>\n",
|
||
" <td>99213</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$4,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>45698</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$7,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>54738</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$7,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>60000</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$6,250.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>31600</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$9,700.00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Make Colour Odometer (KM) Doors Price\n",
|
||
"0 Toyota White 150043 4 $4,000.00\n",
|
||
"1 Honda Red 87899 4 $5,000.00\n",
|
||
"2 Toyota Blue 32549 3 $7,000.00\n",
|
||
"3 BMW Black 11179 5 $22,000.00\n",
|
||
"4 Nissan White 213095 4 $3,500.00\n",
|
||
"5 Toyota Green 99213 4 $4,500.00\n",
|
||
"6 Honda Blue 45698 4 $7,500.00\n",
|
||
"7 Honda Blue 54738 4 $7,000.00\n",
|
||
"8 Toyota White 60000 4 $6,250.00\n",
|
||
"9 Nissan White 31600 4 $9,700.00"
|
||
]
|
||
},
|
||
"execution_count": 63,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"car_sales"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 64,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(1000,\n",
|
||
" [195419,\n",
|
||
" 69066,\n",
|
||
" 209466,\n",
|
||
" 79301,\n",
|
||
" 134103,\n",
|
||
" 143651,\n",
|
||
" 245427,\n",
|
||
" 244095,\n",
|
||
" 176660,\n",
|
||
" 194189])"
|
||
]
|
||
},
|
||
"execution_count": 64,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"odometer = [random.randint(9789, 250000) for i in range(0, 1000)]\n",
|
||
"len(odometer), odometer[:10]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Create fake \"Doors\" data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 65,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"five_doors = [5 for i in range(0, 79)]\n",
|
||
"three_doors = [3 for i in range(0, 65)]\n",
|
||
"four_doors = [4 for i in range(0, 856)]\n",
|
||
"doors = five_doors + three_doors + four_doors\n",
|
||
"doors_shuffled = random.sample(doors, len(doors))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 66,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"[4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 3,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 5,\n",
|
||
" 3,\n",
|
||
" 5,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 5,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 5,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 3,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4,\n",
|
||
" 4]"
|
||
]
|
||
},
|
||
"execution_count": 66,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"doors_shuffled"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Create fake \"Price\" data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 68,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Toyota 398\n",
|
||
"Honda 304\n",
|
||
"Nissan 198\n",
|
||
"BMW 100\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 68,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"makes_series = pd.Series(makes)\n",
|
||
"makes_series.value_counts()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 69,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Make</th>\n",
|
||
" <th>Colour</th>\n",
|
||
" <th>Odometer (KM)</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>150043</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$4,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Red</td>\n",
|
||
" <td>87899</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$5,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>32549</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>$7,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>11179</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>$22,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>213095</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$3,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Green</td>\n",
|
||
" <td>99213</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$4,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>45698</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$7,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>54738</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$7,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>60000</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$6,250.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>31600</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$9,700.00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Make Colour Odometer (KM) Doors Price\n",
|
||
"0 Toyota White 150043 4 $4,000.00\n",
|
||
"1 Honda Red 87899 4 $5,000.00\n",
|
||
"2 Toyota Blue 32549 3 $7,000.00\n",
|
||
"3 BMW Black 11179 5 $22,000.00\n",
|
||
"4 Nissan White 213095 4 $3,500.00\n",
|
||
"5 Toyota Green 99213 4 $4,500.00\n",
|
||
"6 Honda Blue 45698 4 $7,500.00\n",
|
||
"7 Honda Blue 54738 4 $7,000.00\n",
|
||
"8 Toyota White 60000 4 $6,250.00\n",
|
||
"9 Nissan White 31600 4 $9,700.00"
|
||
]
|
||
},
|
||
"execution_count": 69,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"car_sales"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 71,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Make</th>\n",
|
||
" <th>Colour</th>\n",
|
||
" <th>Odometer (KM)</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>150043</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$4,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>32549</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>$7,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Green</td>\n",
|
||
" <td>99213</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$4,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>60000</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$6,250.00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Make Colour Odometer (KM) Doors Price\n",
|
||
"0 Toyota White 150043 4 $4,000.00\n",
|
||
"2 Toyota Blue 32549 3 $7,000.00\n",
|
||
"5 Toyota Green 99213 4 $4,500.00\n",
|
||
"8 Toyota White 60000 4 $6,250.00"
|
||
]
|
||
},
|
||
"execution_count": 71,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"car_sales[car_sales[\"Make\"] == \"Toyota\"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 75,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Make</th>\n",
|
||
" <th>Colour</th>\n",
|
||
" <th>Odometer (KM)</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Red</td>\n",
|
||
" <td>87899</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$5,000.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>45698</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$7,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>54738</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$7,000.00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Make Colour Odometer (KM) Doors Price\n",
|
||
"1 Honda Red 87899 4 $5,000.00\n",
|
||
"6 Honda Blue 45698 4 $7,500.00\n",
|
||
"7 Honda Blue 54738 4 $7,000.00"
|
||
]
|
||
},
|
||
"execution_count": 75,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"car_sales[car_sales[\"Make\"] == \"Honda\"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 76,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Make</th>\n",
|
||
" <th>Colour</th>\n",
|
||
" <th>Odometer (KM)</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>213095</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$3,500.00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>31600</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>$9,700.00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Make Colour Odometer (KM) Doors Price\n",
|
||
"4 Nissan White 213095 4 $3,500.00\n",
|
||
"9 Nissan White 31600 4 $9,700.00"
|
||
]
|
||
},
|
||
"execution_count": 76,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"car_sales[car_sales[\"Make\"] == \"Nissan\"]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 119,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"(1000,\n",
|
||
" [27185,\n",
|
||
" 8815,\n",
|
||
" 23614,\n",
|
||
" 19783,\n",
|
||
" 29208,\n",
|
||
" 17251,\n",
|
||
" 10138,\n",
|
||
" 27640,\n",
|
||
" 7332,\n",
|
||
" 9946,\n",
|
||
" 29670,\n",
|
||
" 12779,\n",
|
||
" 26735,\n",
|
||
" 21481,\n",
|
||
" 9313,\n",
|
||
" 13094,\n",
|
||
" 17684,\n",
|
||
" 21389,\n",
|
||
" 5239,\n",
|
||
" 16733,\n",
|
||
" 19670,\n",
|
||
" 10542,\n",
|
||
" 11122,\n",
|
||
" 21311,\n",
|
||
" 29545,\n",
|
||
" 20601,\n",
|
||
" 22714,\n",
|
||
" 28876,\n",
|
||
" 14063,\n",
|
||
" 5491])"
|
||
]
|
||
},
|
||
"execution_count": 119,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"prices = [random.randint(5000, 30000) for i in range(0, 1000)]\n",
|
||
"len(prices), prices[:30]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Create base dataframe with manufactured data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 136,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Make</th>\n",
|
||
" <th>Colour</th>\n",
|
||
" <th>Odometer (KM)</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
"Empty DataFrame\n",
|
||
"Columns: [Make, Colour, Odometer (KM), Doors, Price]\n",
|
||
"Index: []"
|
||
]
|
||
},
|
||
"execution_count": 136,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"fake_sales = pd.DataFrame(columns = [\"Make\", \"Colour\", \"Odometer (KM)\", \"Doors\", \"Price\"])\n",
|
||
"fake_sales"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 137,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"fake_sales[\"Make\"] = makes\n",
|
||
"fake_sales[\"Colour\"] = colours_shuffled\n",
|
||
"fake_sales[\"Odometer (KM)\"] = odometer\n",
|
||
"fake_sales[\"Doors\"] = doors\n",
|
||
"fake_sales[\"Price\"] = prices"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 138,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Make</th>\n",
|
||
" <th>Colour</th>\n",
|
||
" <th>Odometer (KM)</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>195419</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>27185</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>69066</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>8815</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>209466</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>23614</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>79301</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>19783</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>134103</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>29208</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Make Colour Odometer (KM) Doors Price\n",
|
||
"0 BMW White 195419 5 27185\n",
|
||
"1 BMW White 69066 5 8815\n",
|
||
"2 BMW Blue 209466 5 23614\n",
|
||
"3 BMW Blue 79301 5 19783\n",
|
||
"4 BMW Blue 134103 5 29208"
|
||
]
|
||
},
|
||
"execution_count": 138,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"fake_sales.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Adjust the price column\n",
|
||
"\n",
|
||
"For the price column:\n",
|
||
"* Generate random numbers between certain values\n",
|
||
"* If the Odometer reading is above 100K, multiply price by 0.75\n",
|
||
"* If the Odometer reading is above 150K, multiply price by 0.6\n",
|
||
"* If the Odometer reading is above 200K, multiply price by 0.5\n",
|
||
"* If the Make column is BMW, multiply price by 1.5 and add a random amount between 3,000 and 10,000\n",
|
||
"* If the Make column is Toyota, multiply price by 1.2\n",
|
||
"* If the Make is Nissan, multiply price by 1.1\n",
|
||
"* If the Make is Honda, add $1000 to price"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 139,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"count 1000.000000\n",
|
||
"mean 17369.943000\n",
|
||
"std 7260.398755\n",
|
||
"min 5005.000000\n",
|
||
"25% 11039.500000\n",
|
||
"50% 17427.500000\n",
|
||
"75% 23353.500000\n",
|
||
"max 29990.000000\n",
|
||
"Name: Price, dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 139,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"fake_sales[\"Price\"].describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 140,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"count 1000.000000\n",
|
||
"mean 13151.713000\n",
|
||
"std 6722.177036\n",
|
||
"min 2509.000000\n",
|
||
"25% 7854.750000\n",
|
||
"50% 12016.000000\n",
|
||
"75% 17082.250000\n",
|
||
"max 29990.000000\n",
|
||
"Name: Price, dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 140,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"def price_od(price, odometer):\n",
|
||
" \"\"\"\n",
|
||
" Changes price according to Odometer values.\n",
|
||
" \"\"\"\n",
|
||
" if 100000 <= odometer <= 150000:\n",
|
||
" return round(price * 0.75)\n",
|
||
" elif 150001 <= odometer <= 200000:\n",
|
||
" return round(price * 0.6)\n",
|
||
" elif 200001 <= odometer:\n",
|
||
" return round(price * 0.5)\n",
|
||
" else:\n",
|
||
" return price\n",
|
||
"\n",
|
||
"fake_sales[\"Price\"] = fake_sales.apply(lambda x: price_od(x[\"Price\"], \n",
|
||
" x[\"Odometer (KM)\"]), \n",
|
||
" axis=1)\n",
|
||
"\n",
|
||
"fake_sales[\"Price\"].describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 141,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"count 1000.000000\n",
|
||
"mean 16045.665000\n",
|
||
"std 8630.794219\n",
|
||
"min 2796.000000\n",
|
||
"25% 9481.500000\n",
|
||
"50% 14264.000000\n",
|
||
"75% 20738.750000\n",
|
||
"max 52458.000000\n",
|
||
"Name: Price, dtype: float64"
|
||
]
|
||
},
|
||
"execution_count": 141,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"def price_make(price, make):\n",
|
||
" \"\"\"\n",
|
||
" Manipulates the price base on the cars make.\n",
|
||
" \"\"\"\n",
|
||
" if make == \"BMW\":\n",
|
||
" return round((price * 1.5) + random.randint(3000, 10000))\n",
|
||
" elif make == \"Toyota\":\n",
|
||
" return round(price * 1.2)\n",
|
||
" elif make == \"Nissan\":\n",
|
||
" return round(price * 1.1)\n",
|
||
" elif make == \"Honda\":\n",
|
||
" return round(price + 1000)\n",
|
||
" else:\n",
|
||
" return price\n",
|
||
"\n",
|
||
"fake_sales[\"Price\"] = fake_sales.apply(lambda x: price_make(x[\"Price\"], \n",
|
||
" x[\"Make\"]), \n",
|
||
" axis=1)\n",
|
||
"\n",
|
||
"fake_sales[\"Price\"].describe()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 142,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"fake_sales = fake_sales.sample(frac=1)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 143,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Make</th>\n",
|
||
" <th>Colour</th>\n",
|
||
" <th>Odometer (KM)</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>35431</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>15323</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>192714</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>19943</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>84714</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>28343</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>154365</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>13434</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>181577</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>14043</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Red</td>\n",
|
||
" <td>42652</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>23883</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>163453</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>8473</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>43120</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>20306</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>130538</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>9374</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>51029</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>26683</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Make Colour Odometer (KM) Doors Price\n",
|
||
"0 Honda White 35431 4 15323\n",
|
||
"1 BMW Blue 192714 5 19943\n",
|
||
"2 Honda White 84714 4 28343\n",
|
||
"3 Toyota White 154365 4 13434\n",
|
||
"4 Nissan Blue 181577 3 14043\n",
|
||
"5 Honda Red 42652 4 23883\n",
|
||
"6 Toyota Blue 163453 4 8473\n",
|
||
"7 Honda White 43120 4 20306\n",
|
||
"8 Nissan White 130538 4 9374\n",
|
||
"9 Honda Blue 51029 4 26683"
|
||
]
|
||
},
|
||
"execution_count": 143,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"fake_sales.reset_index(drop=True, inplace=True)\n",
|
||
"fake_sales.head(10)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# NEXT:\n",
|
||
"* Drop some values at random (to manufacture missing data)\n",
|
||
"* Build a random forest model to predict (this will involve changing categories to numerical data)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 146,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Export the data\n",
|
||
"fake_sales.to_csv(\"../data/car-sales-extended.csv\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Make missing data in car_sales_extended"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 61,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"sales_ext = pd.read_csv(\"../data/car-sales-extended.csv\")"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 62,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"1000"
|
||
]
|
||
},
|
||
"execution_count": 62,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"len(sales_ext)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 63,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Make</th>\n",
|
||
" <th>Colour</th>\n",
|
||
" <th>Odometer (KM)</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>35431</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>15323</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>192714</td>\n",
|
||
" <td>5</td>\n",
|
||
" <td>19943</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>84714</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>28343</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>154365</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>13434</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>181577</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>14043</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>...</th>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>...</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>995</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>35820</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>32042</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>996</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>155144</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>5716</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>997</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>66604</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>31570</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>998</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>215883</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>4001</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>999</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>248360</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>12732</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>1000 rows × 5 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Make Colour Odometer (KM) Doors Price\n",
|
||
"0 Honda White 35431 4 15323\n",
|
||
"1 BMW Blue 192714 5 19943\n",
|
||
"2 Honda White 84714 4 28343\n",
|
||
"3 Toyota White 154365 4 13434\n",
|
||
"4 Nissan Blue 181577 3 14043\n",
|
||
".. ... ... ... ... ...\n",
|
||
"995 Toyota Black 35820 4 32042\n",
|
||
"996 Nissan White 155144 3 5716\n",
|
||
"997 Nissan Blue 66604 4 31570\n",
|
||
"998 Honda White 215883 4 4001\n",
|
||
"999 Toyota Blue 248360 4 12732\n",
|
||
"\n",
|
||
"[1000 rows x 5 columns]"
|
||
]
|
||
},
|
||
"execution_count": 63,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"sales_ext"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### What we want to do\n",
|
||
"* Remove some row values or replace them at random\n",
|
||
" * E.g. replace strings with empty strings (\"\")\n",
|
||
" * And numbers with NaN or something similar...\n",
|
||
"* Want to keep the number of samples the same, order the same, just put some holes in it\n",
|
||
"\n",
|
||
"One way to do it would be to generate 50 random integers for each column and then drop/replace the values at those indices."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 64,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Replicate the df\n",
|
||
"sales_ext_dropped = sales_ext"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 65,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Make column\n",
|
||
"np.random.seed(10)\n",
|
||
"make_idx = np.random.randint(0, 1000, 50)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 77,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"array([265, 125, 996, 527, 320, 369, 123, 156, 985, 733, 496, 925, 881,\n",
|
||
" 8, 73, 256, 490, 40, 502, 420, 371, 528, 356, 239, 395, 54,\n",
|
||
" 344, 363, 122, 574, 545, 200, 868, 974, 689, 691, 54, 77, 453,\n",
|
||
" 13, 755, 409, 382, 653, 860, 342, 798, 670, 89, 652])"
|
||
]
|
||
},
|
||
"execution_count": 77,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"make_idx"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 66,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"for value in make_idx:\n",
|
||
" sales_ext_dropped.loc[value, \"Make\"] = \"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 67,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"'Honda'"
|
||
]
|
||
},
|
||
"execution_count": 67,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"sales_ext_dropped[\"Make\"][266]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 68,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Colour column\n",
|
||
"np.random.seed(42)\n",
|
||
"colour_idx = np.random.randint(0, 1000, 50)\n",
|
||
"for value in colour_idx:\n",
|
||
" sales_ext_dropped.loc[value, \"Colour\"] = \"\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 69,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Odometer (KM) column\n",
|
||
"np.random.seed(1)\n",
|
||
"odom_idx = np.random.randint(0, 1000, 50)\n",
|
||
"for value in odom_idx:\n",
|
||
" sales_ext_dropped.loc[value, \"Odometer (KM)\"] = None"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 70,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Doors column\n",
|
||
"np.random.seed(2)\n",
|
||
"door_idx = np.random.randint(0, 1000, 50)\n",
|
||
"for value in door_idx:\n",
|
||
" sales_ext_dropped.loc[value, \"Doors\"] = None"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 71,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Price column\n",
|
||
"np.random.seed(3)\n",
|
||
"price_idx = np.random.randint(0, 1000, 50)\n",
|
||
"for value in price_idx:\n",
|
||
" sales_ext_dropped.loc[value, \"Price\"] = None"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 72,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Make</th>\n",
|
||
" <th>Colour</th>\n",
|
||
" <th>Odometer (KM)</th>\n",
|
||
" <th>Doors</th>\n",
|
||
" <th>Price</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>35431.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>15323.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>192714.0</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>19943.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>84714.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>28343.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>154365.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>13434.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>181577.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>14043.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Red</td>\n",
|
||
" <td>42652.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>23883.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>163453.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>8473.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>20306.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td></td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>130538.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>9374.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>51029.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>26683.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>167421.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>16259.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>Green</td>\n",
|
||
" <td>17119.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>6160.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>102303.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>16909.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td></td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>134181.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>11121.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>199833.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>18946.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>205592.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>16290.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Red</td>\n",
|
||
" <td>96742.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>34465.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>BMW</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>194189.0</td>\n",
|
||
" <td>5.0</td>\n",
|
||
" <td>17177.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>67991.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>9109.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>215820.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>6010.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td></td>\n",
|
||
" <td>124844.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>24130.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>21</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td></td>\n",
|
||
" <td>30615.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>29653.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>148744.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>22489.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Green</td>\n",
|
||
" <td>130075.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>21242.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>172718.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>14274.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>125819.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>15686.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>180390.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>13344.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Green</td>\n",
|
||
" <td>82783.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>10984.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>28</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>56687.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>6135.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>29</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>112004.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>13586.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>30</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>34024.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>23929.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>31</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>108569.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>6866.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>32</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>203795.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>9330.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>33</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>Green</td>\n",
|
||
" <td>153554.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>9780.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>34</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>71949.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>17516.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>35</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>191622.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>8738.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>36</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>38186.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>23438.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>37</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>15131.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>38</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>146430.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>9224.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>39</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Green</td>\n",
|
||
" <td>109868.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>6574.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>40</th>\n",
|
||
" <td></td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>177894.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>9229.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>41</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>189209.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>7100.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>42</th>\n",
|
||
" <td>Honda</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>200490.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>6337.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>43</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>141617.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>19085.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>44</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>213893.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>5743.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>45</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>231057.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>5925.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>46</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>100938.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>25196.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>47</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Blue</td>\n",
|
||
" <td>243969.0</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>16138.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>48</th>\n",
|
||
" <td>Nissan</td>\n",
|
||
" <td>White</td>\n",
|
||
" <td>107096.0</td>\n",
|
||
" <td>3.0</td>\n",
|
||
" <td>6075.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>49</th>\n",
|
||
" <td>Toyota</td>\n",
|
||
" <td>Black</td>\n",
|
||
" <td>86333.0</td>\n",
|
||
" <td>4.0</td>\n",
|
||
" <td>9928.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Make Colour Odometer (KM) Doors Price\n",
|
||
"0 Honda White 35431.0 4.0 15323.0\n",
|
||
"1 BMW Blue 192714.0 5.0 19943.0\n",
|
||
"2 Honda White 84714.0 4.0 28343.0\n",
|
||
"3 Toyota White 154365.0 4.0 13434.0\n",
|
||
"4 Nissan Blue 181577.0 3.0 14043.0\n",
|
||
"5 Honda Red 42652.0 4.0 23883.0\n",
|
||
"6 Toyota Blue 163453.0 4.0 8473.0\n",
|
||
"7 Honda White NaN 4.0 20306.0\n",
|
||
"8 White 130538.0 4.0 9374.0\n",
|
||
"9 Honda Blue 51029.0 4.0 26683.0\n",
|
||
"10 Nissan White 167421.0 4.0 16259.0\n",
|
||
"11 Nissan Green 17119.0 4.0 6160.0\n",
|
||
"12 Nissan White 102303.0 4.0 16909.0\n",
|
||
"13 White 134181.0 4.0 11121.0\n",
|
||
"14 Honda Blue 199833.0 4.0 18946.0\n",
|
||
"15 Toyota Blue 205592.0 4.0 16290.0\n",
|
||
"16 Toyota Red 96742.0 4.0 34465.0\n",
|
||
"17 BMW White 194189.0 5.0 17177.0\n",
|
||
"18 Nissan White 67991.0 3.0 9109.0\n",
|
||
"19 Nissan Blue 215820.0 4.0 6010.0\n",
|
||
"20 Toyota 124844.0 4.0 24130.0\n",
|
||
"21 Honda 30615.0 4.0 29653.0\n",
|
||
"22 Toyota White 148744.0 4.0 22489.0\n",
|
||
"23 Honda Green 130075.0 4.0 21242.0\n",
|
||
"24 Honda Blue 172718.0 4.0 14274.0\n",
|
||
"25 Honda Blue 125819.0 4.0 15686.0\n",
|
||
"26 Honda White 180390.0 4.0 13344.0\n",
|
||
"27 Honda Green 82783.0 4.0 10984.0\n",
|
||
"28 Honda White 56687.0 4.0 6135.0\n",
|
||
"29 Toyota White 112004.0 4.0 13586.0\n",
|
||
"30 Nissan Blue 34024.0 4.0 23929.0\n",
|
||
"31 Toyota White 108569.0 NaN 6866.0\n",
|
||
"32 Nissan White 203795.0 4.0 9330.0\n",
|
||
"33 Nissan Green 153554.0 3.0 9780.0\n",
|
||
"34 Honda Blue 71949.0 4.0 17516.0\n",
|
||
"35 Honda Blue 191622.0 4.0 8738.0\n",
|
||
"36 Nissan Blue 38186.0 4.0 23438.0\n",
|
||
"37 Nissan White NaN 4.0 15131.0\n",
|
||
"38 Nissan Blue 146430.0 3.0 9224.0\n",
|
||
"39 Toyota Green 109868.0 4.0 6574.0\n",
|
||
"40 White 177894.0 4.0 9229.0\n",
|
||
"41 Toyota Blue 189209.0 4.0 7100.0\n",
|
||
"42 Honda Black 200490.0 4.0 6337.0\n",
|
||
"43 Toyota Blue 141617.0 4.0 19085.0\n",
|
||
"44 Toyota Blue 213893.0 4.0 5743.0\n",
|
||
"45 Nissan Blue 231057.0 4.0 5925.0\n",
|
||
"46 Toyota White 100938.0 4.0 25196.0\n",
|
||
"47 Toyota Blue 243969.0 NaN 16138.0\n",
|
||
"48 Nissan White 107096.0 3.0 6075.0\n",
|
||
"49 Toyota Black 86333.0 4.0 9928.0"
|
||
]
|
||
},
|
||
"execution_count": 72,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"sales_ext_dropped.head(50)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 73,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Make 0\n",
|
||
"Colour 0\n",
|
||
"Odometer (KM) 50\n",
|
||
"Doors 50\n",
|
||
"Price 50\n",
|
||
"dtype: int64"
|
||
]
|
||
},
|
||
"execution_count": 73,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Check how many of our values are missing/NaN\n",
|
||
"sales_ext_dropped.isna().sum()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 74,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# Export dataframe with random missing values\n",
|
||
"sales_ext_dropped.to_csv(\"../data/car-sales-extended-missing-data.csv\", index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.7.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|