Files
machine-learning/car-sales/car-sales-data-manufacture.ipynb
T
QkoSad 42a130e272 .
2025-07-16 16:10:55 +03:00

3613 lines
84 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Creating fake data for car_sales (to make it a bit bigger)\n",
"\n",
"This notebook will manufacture data for the car_sales dataframe to make it usable to explain different techniques for missing data and converting things to numbers."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"car_sales = pd.read_csv('../data/car-sales.csv')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Colour</th>\n",
" <th>Odometer (KM)</th>\n",
" <th>Doors</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>150043</td>\n",
" <td>4</td>\n",
" <td>$4,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Honda</td>\n",
" <td>Red</td>\n",
" <td>87899</td>\n",
" <td>4</td>\n",
" <td>$5,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>32549</td>\n",
" <td>3</td>\n",
" <td>$7,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>BMW</td>\n",
" <td>Black</td>\n",
" <td>11179</td>\n",
" <td>5</td>\n",
" <td>$22,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>213095</td>\n",
" <td>4</td>\n",
" <td>$3,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Toyota</td>\n",
" <td>Green</td>\n",
" <td>99213</td>\n",
" <td>4</td>\n",
" <td>$4,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>45698</td>\n",
" <td>4</td>\n",
" <td>$7,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>54738</td>\n",
" <td>4</td>\n",
" <td>$7,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>60000</td>\n",
" <td>4</td>\n",
" <td>$6,250.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>31600</td>\n",
" <td>4</td>\n",
" <td>$9,700.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Toyota White 150043 4 $4,000.00\n",
"1 Honda Red 87899 4 $5,000.00\n",
"2 Toyota Blue 32549 3 $7,000.00\n",
"3 BMW Black 11179 5 $22,000.00\n",
"4 Nissan White 213095 4 $3,500.00\n",
"5 Toyota Green 99213 4 $4,500.00\n",
"6 Honda Blue 45698 4 $7,500.00\n",
"7 Honda Blue 54738 4 $7,000.00\n",
"8 Toyota White 60000 4 $6,250.00\n",
"9 Nissan White 31600 4 $9,700.00"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Toyota', 'Honda', 'BMW', 'Nissan'], dtype=object)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales.Make.unique()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Toyota 4\n",
"Honda 3\n",
"Nissan 2\n",
"BMW 1\n",
"Name: Make, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales.Make.value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create fake \"Make\" data"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(393,\n",
" ['Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota'])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create fake \"Make\" data\n",
"\n",
"toyota = [\"Toyota\" for i in range(0, 393)]\n",
"len(toyota), toyota[:10]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"(304,\n",
" ['Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda'])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"honda = [\"Honda\" for i in range(0, 304)]\n",
"len(honda), honda[:10]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(198,\n",
" ['Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan'])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nissan = [\"Nissan\" for i in range(0, 198)]\n",
"len(nissan), nissan[:10]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(100, ['BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW'])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bmw = [\"BMW\" for i in range(0, 100)]\n",
"len(bmw), bmw[:10]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1000"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"makes = bmw+nissan+toyota+honda\n",
"len(makes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create fake \"Colour\" data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['White', 'Red', 'Blue', 'Black', 'Green'], dtype=object)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales.Colour.unique()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"White 4\n",
"Blue 3\n",
"Green 1\n",
"Black 1\n",
"Red 1\n",
"Name: Colour, dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales.Colour.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(407, ['White', 'White', 'White'])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"white = [\"White\" for i in range(0, 407)]\n",
"len(white), white[:3]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(321, ['Blue', 'Blue', 'Blue'])"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"blue = [\"Blue\" for i in range(0, 321)]\n",
"len(blue), blue[:3]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(79, ['Green', 'Green', 'Green'])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"green = [\"Green\" for i in range(0, 79)]\n",
"len(green), green[:3]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(99, ['Black', 'Black', 'Black'])"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"black = [\"Black\" for i in range(0, 99)]\n",
"len(black), black[:3]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(94, ['Red', 'Red', 'Red'])"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"red = [\"Red\" for i in range(0, 94)]\n",
"len(red), red[:3]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1000"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"colours = white+blue+green+black+red\n",
"len(colours)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1000,\n",
" ['White',\n",
" 'White',\n",
" 'Blue',\n",
" 'Blue',\n",
" 'Blue',\n",
" 'White',\n",
" 'Blue',\n",
" 'Blue',\n",
" 'Red',\n",
" 'White'])"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import random\n",
"colours_shuffled = random.sample(colours, len(colours))\n",
"len(colours_shuffled), colours_shuffled[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create fake Odometer (KM) data"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Colour</th>\n",
" <th>Odometer (KM)</th>\n",
" <th>Doors</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>150043</td>\n",
" <td>4</td>\n",
" <td>$4,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Honda</td>\n",
" <td>Red</td>\n",
" <td>87899</td>\n",
" <td>4</td>\n",
" <td>$5,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>32549</td>\n",
" <td>3</td>\n",
" <td>$7,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>BMW</td>\n",
" <td>Black</td>\n",
" <td>11179</td>\n",
" <td>5</td>\n",
" <td>$22,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>213095</td>\n",
" <td>4</td>\n",
" <td>$3,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Toyota</td>\n",
" <td>Green</td>\n",
" <td>99213</td>\n",
" <td>4</td>\n",
" <td>$4,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>45698</td>\n",
" <td>4</td>\n",
" <td>$7,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>54738</td>\n",
" <td>4</td>\n",
" <td>$7,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>60000</td>\n",
" <td>4</td>\n",
" <td>$6,250.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>31600</td>\n",
" <td>4</td>\n",
" <td>$9,700.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Toyota White 150043 4 $4,000.00\n",
"1 Honda Red 87899 4 $5,000.00\n",
"2 Toyota Blue 32549 3 $7,000.00\n",
"3 BMW Black 11179 5 $22,000.00\n",
"4 Nissan White 213095 4 $3,500.00\n",
"5 Toyota Green 99213 4 $4,500.00\n",
"6 Honda Blue 45698 4 $7,500.00\n",
"7 Honda Blue 54738 4 $7,000.00\n",
"8 Toyota White 60000 4 $6,250.00\n",
"9 Nissan White 31600 4 $9,700.00"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1000,\n",
" [195419,\n",
" 69066,\n",
" 209466,\n",
" 79301,\n",
" 134103,\n",
" 143651,\n",
" 245427,\n",
" 244095,\n",
" 176660,\n",
" 194189])"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"odometer = [random.randint(9789, 250000) for i in range(0, 1000)]\n",
"len(odometer), odometer[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create fake \"Doors\" data"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"five_doors = [5 for i in range(0, 79)]\n",
"three_doors = [3 for i in range(0, 65)]\n",
"four_doors = [4 for i in range(0, 856)]\n",
"doors = five_doors + three_doors + four_doors\n",
"doors_shuffled = random.sample(doors, len(doors))"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 3,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 3,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4]"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doors_shuffled"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create fake \"Price\" data"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Toyota 398\n",
"Honda 304\n",
"Nissan 198\n",
"BMW 100\n",
"dtype: int64"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"makes_series = pd.Series(makes)\n",
"makes_series.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Colour</th>\n",
" <th>Odometer (KM)</th>\n",
" <th>Doors</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>150043</td>\n",
" <td>4</td>\n",
" <td>$4,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Honda</td>\n",
" <td>Red</td>\n",
" <td>87899</td>\n",
" <td>4</td>\n",
" <td>$5,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>32549</td>\n",
" <td>3</td>\n",
" <td>$7,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>BMW</td>\n",
" <td>Black</td>\n",
" <td>11179</td>\n",
" <td>5</td>\n",
" <td>$22,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>213095</td>\n",
" <td>4</td>\n",
" <td>$3,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Toyota</td>\n",
" <td>Green</td>\n",
" <td>99213</td>\n",
" <td>4</td>\n",
" <td>$4,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>45698</td>\n",
" <td>4</td>\n",
" <td>$7,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>54738</td>\n",
" <td>4</td>\n",
" <td>$7,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>60000</td>\n",
" <td>4</td>\n",
" <td>$6,250.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>31600</td>\n",
" <td>4</td>\n",
" <td>$9,700.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Toyota White 150043 4 $4,000.00\n",
"1 Honda Red 87899 4 $5,000.00\n",
"2 Toyota Blue 32549 3 $7,000.00\n",
"3 BMW Black 11179 5 $22,000.00\n",
"4 Nissan White 213095 4 $3,500.00\n",
"5 Toyota Green 99213 4 $4,500.00\n",
"6 Honda Blue 45698 4 $7,500.00\n",
"7 Honda Blue 54738 4 $7,000.00\n",
"8 Toyota White 60000 4 $6,250.00\n",
"9 Nissan White 31600 4 $9,700.00"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Colour</th>\n",
" <th>Odometer (KM)</th>\n",
" <th>Doors</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>150043</td>\n",
" <td>4</td>\n",
" <td>$4,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>32549</td>\n",
" <td>3</td>\n",
" <td>$7,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Toyota</td>\n",
" <td>Green</td>\n",
" <td>99213</td>\n",
" <td>4</td>\n",
" <td>$4,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>60000</td>\n",
" <td>4</td>\n",
" <td>$6,250.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Toyota White 150043 4 $4,000.00\n",
"2 Toyota Blue 32549 3 $7,000.00\n",
"5 Toyota Green 99213 4 $4,500.00\n",
"8 Toyota White 60000 4 $6,250.00"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales[car_sales[\"Make\"] == \"Toyota\"]"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Colour</th>\n",
" <th>Odometer (KM)</th>\n",
" <th>Doors</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Honda</td>\n",
" <td>Red</td>\n",
" <td>87899</td>\n",
" <td>4</td>\n",
" <td>$5,000.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>45698</td>\n",
" <td>4</td>\n",
" <td>$7,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>54738</td>\n",
" <td>4</td>\n",
" <td>$7,000.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"1 Honda Red 87899 4 $5,000.00\n",
"6 Honda Blue 45698 4 $7,500.00\n",
"7 Honda Blue 54738 4 $7,000.00"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales[car_sales[\"Make\"] == \"Honda\"]"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Colour</th>\n",
" <th>Odometer (KM)</th>\n",
" <th>Doors</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>213095</td>\n",
" <td>4</td>\n",
" <td>$3,500.00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>31600</td>\n",
" <td>4</td>\n",
" <td>$9,700.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"4 Nissan White 213095 4 $3,500.00\n",
"9 Nissan White 31600 4 $9,700.00"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales[car_sales[\"Make\"] == \"Nissan\"]"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1000,\n",
" [27185,\n",
" 8815,\n",
" 23614,\n",
" 19783,\n",
" 29208,\n",
" 17251,\n",
" 10138,\n",
" 27640,\n",
" 7332,\n",
" 9946,\n",
" 29670,\n",
" 12779,\n",
" 26735,\n",
" 21481,\n",
" 9313,\n",
" 13094,\n",
" 17684,\n",
" 21389,\n",
" 5239,\n",
" 16733,\n",
" 19670,\n",
" 10542,\n",
" 11122,\n",
" 21311,\n",
" 29545,\n",
" 20601,\n",
" 22714,\n",
" 28876,\n",
" 14063,\n",
" 5491])"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prices = [random.randint(5000, 30000) for i in range(0, 1000)]\n",
"len(prices), prices[:30]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create base dataframe with manufactured data"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Colour</th>\n",
" <th>Odometer (KM)</th>\n",
" <th>Doors</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Make, Colour, Odometer (KM), Doors, Price]\n",
"Index: []"
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fake_sales = pd.DataFrame(columns = [\"Make\", \"Colour\", \"Odometer (KM)\", \"Doors\", \"Price\"])\n",
"fake_sales"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"fake_sales[\"Make\"] = makes\n",
"fake_sales[\"Colour\"] = colours_shuffled\n",
"fake_sales[\"Odometer (KM)\"] = odometer\n",
"fake_sales[\"Doors\"] = doors\n",
"fake_sales[\"Price\"] = prices"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Colour</th>\n",
" <th>Odometer (KM)</th>\n",
" <th>Doors</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>BMW</td>\n",
" <td>White</td>\n",
" <td>195419</td>\n",
" <td>5</td>\n",
" <td>27185</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BMW</td>\n",
" <td>White</td>\n",
" <td>69066</td>\n",
" <td>5</td>\n",
" <td>8815</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>BMW</td>\n",
" <td>Blue</td>\n",
" <td>209466</td>\n",
" <td>5</td>\n",
" <td>23614</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>BMW</td>\n",
" <td>Blue</td>\n",
" <td>79301</td>\n",
" <td>5</td>\n",
" <td>19783</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>BMW</td>\n",
" <td>Blue</td>\n",
" <td>134103</td>\n",
" <td>5</td>\n",
" <td>29208</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 BMW White 195419 5 27185\n",
"1 BMW White 69066 5 8815\n",
"2 BMW Blue 209466 5 23614\n",
"3 BMW Blue 79301 5 19783\n",
"4 BMW Blue 134103 5 29208"
]
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fake_sales.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Adjust the price column\n",
"\n",
"For the price column:\n",
"* Generate random numbers between the certain values\n",
"* If the Odometer reading is above 100K, multiply price by 0.75\n",
"* If the Odometer reading is above 150K, multiply price by 0.6\n",
"* If the Odometer reading is above 200K, multiply price by 0.5\n",
"* If the Make column is BMW, multiply price by 1.5 + 2500\n",
"* If the Make column is Toyota, multuply price by 1.2\n",
"* If the Make is Nissan, multiply price by 1.1\n",
"* If the Make is Honda, add $1000 to price"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 1000.000000\n",
"mean 17369.943000\n",
"std 7260.398755\n",
"min 5005.000000\n",
"25% 11039.500000\n",
"50% 17427.500000\n",
"75% 23353.500000\n",
"max 29990.000000\n",
"Name: Price, dtype: float64"
]
},
"execution_count": 139,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fake_sales[\"Price\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 1000.000000\n",
"mean 13151.713000\n",
"std 6722.177036\n",
"min 2509.000000\n",
"25% 7854.750000\n",
"50% 12016.000000\n",
"75% 17082.250000\n",
"max 29990.000000\n",
"Name: Price, dtype: float64"
]
},
"execution_count": 140,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def price_od(price, odometer):\n",
" \"\"\"\n",
" Changes price according to Odometer values.\n",
" \"\"\"\n",
" if 100000 <= odometer <= 150000:\n",
" return round(price * 0.75)\n",
" elif 150001 <= odometer <= 200000:\n",
" return round(price * 0.6)\n",
" elif 200001 <= odometer:\n",
" return round(price * 0.5)\n",
" else:\n",
" return price\n",
"\n",
"fake_sales[\"Price\"] = fake_sales.apply(lambda x: price_od(x[\"Price\"], \n",
" x[\"Odometer (KM)\"]), \n",
" axis=1)\n",
"\n",
"fake_sales[\"Price\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 1000.000000\n",
"mean 16045.665000\n",
"std 8630.794219\n",
"min 2796.000000\n",
"25% 9481.500000\n",
"50% 14264.000000\n",
"75% 20738.750000\n",
"max 52458.000000\n",
"Name: Price, dtype: float64"
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def price_make(price, make):\n",
" \"\"\"\n",
" Manipulates the price base on the cars make.\n",
" \"\"\"\n",
" if make == \"BMW\":\n",
" return round((price * 1.5) + random.randint(3000, 10000))\n",
" elif make == \"Toyota\":\n",
" return round(price * 1.2)\n",
" elif make == \"Nissan\":\n",
" return round(price * 1.1)\n",
" elif make == \"Honda\":\n",
" return round(price + 1000)\n",
" else:\n",
" return price\n",
"\n",
"fake_sales[\"Price\"] = fake_sales.apply(lambda x: price_make(x[\"Price\"], \n",
" x[\"Make\"]), \n",
" axis=1)\n",
"\n",
"fake_sales[\"Price\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [],
"source": [
"fake_sales = fake_sales.sample(frac=1)"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Colour</th>\n",
" <th>Odometer (KM)</th>\n",
" <th>Doors</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Honda</td>\n",
" <td>White</td>\n",
" <td>35431</td>\n",
" <td>4</td>\n",
" <td>15323</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BMW</td>\n",
" <td>Blue</td>\n",
" <td>192714</td>\n",
" <td>5</td>\n",
" <td>19943</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Honda</td>\n",
" <td>White</td>\n",
" <td>84714</td>\n",
" <td>4</td>\n",
" <td>28343</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>154365</td>\n",
" <td>4</td>\n",
" <td>13434</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Blue</td>\n",
" <td>181577</td>\n",
" <td>3</td>\n",
" <td>14043</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Honda</td>\n",
" <td>Red</td>\n",
" <td>42652</td>\n",
" <td>4</td>\n",
" <td>23883</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>163453</td>\n",
" <td>4</td>\n",
" <td>8473</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Honda</td>\n",
" <td>White</td>\n",
" <td>43120</td>\n",
" <td>4</td>\n",
" <td>20306</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>130538</td>\n",
" <td>4</td>\n",
" <td>9374</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>51029</td>\n",
" <td>4</td>\n",
" <td>26683</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Honda White 35431 4 15323\n",
"1 BMW Blue 192714 5 19943\n",
"2 Honda White 84714 4 28343\n",
"3 Toyota White 154365 4 13434\n",
"4 Nissan Blue 181577 3 14043\n",
"5 Honda Red 42652 4 23883\n",
"6 Toyota Blue 163453 4 8473\n",
"7 Honda White 43120 4 20306\n",
"8 Nissan White 130538 4 9374\n",
"9 Honda Blue 51029 4 26683"
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fake_sales.reset_index(drop=True, inplace=True)\n",
"fake_sales.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NEXT:\n",
"* Drop some values at random (to manufacture missing data)\n",
"* Build a random forest model to predict (this will involve changing categories to numerical data)"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
"# Export the data\n",
"fake_sales.to_csv(\"../data/car-sales-extended.csv\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Make missing data in car_sales_extended"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"sales_ext = pd.read_csv(\"../data/car-sales-extended.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1000"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(sales_ext)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Colour</th>\n",
" <th>Odometer (KM)</th>\n",
" <th>Doors</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Honda</td>\n",
" <td>White</td>\n",
" <td>35431</td>\n",
" <td>4</td>\n",
" <td>15323</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BMW</td>\n",
" <td>Blue</td>\n",
" <td>192714</td>\n",
" <td>5</td>\n",
" <td>19943</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Honda</td>\n",
" <td>White</td>\n",
" <td>84714</td>\n",
" <td>4</td>\n",
" <td>28343</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>154365</td>\n",
" <td>4</td>\n",
" <td>13434</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Blue</td>\n",
" <td>181577</td>\n",
" <td>3</td>\n",
" <td>14043</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>995</th>\n",
" <td>Toyota</td>\n",
" <td>Black</td>\n",
" <td>35820</td>\n",
" <td>4</td>\n",
" <td>32042</td>\n",
" </tr>\n",
" <tr>\n",
" <th>996</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>155144</td>\n",
" <td>3</td>\n",
" <td>5716</td>\n",
" </tr>\n",
" <tr>\n",
" <th>997</th>\n",
" <td>Nissan</td>\n",
" <td>Blue</td>\n",
" <td>66604</td>\n",
" <td>4</td>\n",
" <td>31570</td>\n",
" </tr>\n",
" <tr>\n",
" <th>998</th>\n",
" <td>Honda</td>\n",
" <td>White</td>\n",
" <td>215883</td>\n",
" <td>4</td>\n",
" <td>4001</td>\n",
" </tr>\n",
" <tr>\n",
" <th>999</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>248360</td>\n",
" <td>4</td>\n",
" <td>12732</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1000 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Honda White 35431 4 15323\n",
"1 BMW Blue 192714 5 19943\n",
"2 Honda White 84714 4 28343\n",
"3 Toyota White 154365 4 13434\n",
"4 Nissan Blue 181577 3 14043\n",
".. ... ... ... ... ...\n",
"995 Toyota Black 35820 4 32042\n",
"996 Nissan White 155144 3 5716\n",
"997 Nissan Blue 66604 4 31570\n",
"998 Honda White 215883 4 4001\n",
"999 Toyota Blue 248360 4 12732\n",
"\n",
"[1000 rows x 5 columns]"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales_ext"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### What we want to do\n",
"* Remove some rows values or replace them at random\n",
" * E.g. replace strings with empty strings (\"\")\n",
" * And numbers with NaN or something similar...\n",
"* Want to keep the number of samples the same, order the same, just put some holes in it\n",
"\n",
"One way to do it would be to generate 50 random integers for each column and then drop/replace the indicies."
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# Replicate the df\n",
"sales_ext_dropped = sales_ext"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"# Make column\n",
"np.random.seed(10)\n",
"make_idx = np.random.randint(0, 1000, 50)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([265, 125, 996, 527, 320, 369, 123, 156, 985, 733, 496, 925, 881,\n",
" 8, 73, 256, 490, 40, 502, 420, 371, 528, 356, 239, 395, 54,\n",
" 344, 363, 122, 574, 545, 200, 868, 974, 689, 691, 54, 77, 453,\n",
" 13, 755, 409, 382, 653, 860, 342, 798, 670, 89, 652])"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"make_idx"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"for value in make_idx:\n",
" sales_ext_dropped.loc[value, \"Make\"] = \"\""
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Honda'"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales_ext_dropped[\"Make\"][266]"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# Colour column\n",
"np.random.seed(42)\n",
"colour_idx = np.random.randint(0, 1000, 50)\n",
"for value in colour_idx:\n",
" sales_ext_dropped.loc[value, \"Colour\"] = \"\""
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"# Odometer (KM) column\n",
"np.random.seed(1)\n",
"odom_idx = np.random.randint(0, 1000, 50)\n",
"for value in odom_idx:\n",
" sales_ext_dropped.loc[value, \"Odometer (KM)\"] = None"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"# Doors column\n",
"np.random.seed(2)\n",
"door_idx = np.random.randint(0, 1000, 50)\n",
"for value in door_idx:\n",
" sales_ext_dropped.loc[value, \"Doors\"] = None"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"# Price column\n",
"np.random.seed(3)\n",
"price_idx = np.random.randint(0, 1000, 50)\n",
"for value in price_idx:\n",
" sales_ext_dropped.loc[value, \"Price\"] = None"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Make</th>\n",
" <th>Colour</th>\n",
" <th>Odometer (KM)</th>\n",
" <th>Doors</th>\n",
" <th>Price</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Honda</td>\n",
" <td>White</td>\n",
" <td>35431.0</td>\n",
" <td>4.0</td>\n",
" <td>15323.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>BMW</td>\n",
" <td>Blue</td>\n",
" <td>192714.0</td>\n",
" <td>5.0</td>\n",
" <td>19943.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Honda</td>\n",
" <td>White</td>\n",
" <td>84714.0</td>\n",
" <td>4.0</td>\n",
" <td>28343.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>154365.0</td>\n",
" <td>4.0</td>\n",
" <td>13434.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Nissan</td>\n",
" <td>Blue</td>\n",
" <td>181577.0</td>\n",
" <td>3.0</td>\n",
" <td>14043.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Honda</td>\n",
" <td>Red</td>\n",
" <td>42652.0</td>\n",
" <td>4.0</td>\n",
" <td>23883.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>163453.0</td>\n",
" <td>4.0</td>\n",
" <td>8473.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Honda</td>\n",
" <td>White</td>\n",
" <td>NaN</td>\n",
" <td>4.0</td>\n",
" <td>20306.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td></td>\n",
" <td>White</td>\n",
" <td>130538.0</td>\n",
" <td>4.0</td>\n",
" <td>9374.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>51029.0</td>\n",
" <td>4.0</td>\n",
" <td>26683.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>167421.0</td>\n",
" <td>4.0</td>\n",
" <td>16259.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Nissan</td>\n",
" <td>Green</td>\n",
" <td>17119.0</td>\n",
" <td>4.0</td>\n",
" <td>6160.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>102303.0</td>\n",
" <td>4.0</td>\n",
" <td>16909.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td></td>\n",
" <td>White</td>\n",
" <td>134181.0</td>\n",
" <td>4.0</td>\n",
" <td>11121.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>199833.0</td>\n",
" <td>4.0</td>\n",
" <td>18946.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>205592.0</td>\n",
" <td>4.0</td>\n",
" <td>16290.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Toyota</td>\n",
" <td>Red</td>\n",
" <td>96742.0</td>\n",
" <td>4.0</td>\n",
" <td>34465.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>BMW</td>\n",
" <td>White</td>\n",
" <td>194189.0</td>\n",
" <td>5.0</td>\n",
" <td>17177.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>67991.0</td>\n",
" <td>3.0</td>\n",
" <td>9109.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Nissan</td>\n",
" <td>Blue</td>\n",
" <td>215820.0</td>\n",
" <td>4.0</td>\n",
" <td>6010.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>Toyota</td>\n",
" <td></td>\n",
" <td>124844.0</td>\n",
" <td>4.0</td>\n",
" <td>24130.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>Honda</td>\n",
" <td></td>\n",
" <td>30615.0</td>\n",
" <td>4.0</td>\n",
" <td>29653.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>148744.0</td>\n",
" <td>4.0</td>\n",
" <td>22489.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>Honda</td>\n",
" <td>Green</td>\n",
" <td>130075.0</td>\n",
" <td>4.0</td>\n",
" <td>21242.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>172718.0</td>\n",
" <td>4.0</td>\n",
" <td>14274.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>125819.0</td>\n",
" <td>4.0</td>\n",
" <td>15686.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>Honda</td>\n",
" <td>White</td>\n",
" <td>180390.0</td>\n",
" <td>4.0</td>\n",
" <td>13344.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>Honda</td>\n",
" <td>Green</td>\n",
" <td>82783.0</td>\n",
" <td>4.0</td>\n",
" <td>10984.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>Honda</td>\n",
" <td>White</td>\n",
" <td>56687.0</td>\n",
" <td>4.0</td>\n",
" <td>6135.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>112004.0</td>\n",
" <td>4.0</td>\n",
" <td>13586.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>Nissan</td>\n",
" <td>Blue</td>\n",
" <td>34024.0</td>\n",
" <td>4.0</td>\n",
" <td>23929.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>108569.0</td>\n",
" <td>NaN</td>\n",
" <td>6866.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>203795.0</td>\n",
" <td>4.0</td>\n",
" <td>9330.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>Nissan</td>\n",
" <td>Green</td>\n",
" <td>153554.0</td>\n",
" <td>3.0</td>\n",
" <td>9780.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>71949.0</td>\n",
" <td>4.0</td>\n",
" <td>17516.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>Honda</td>\n",
" <td>Blue</td>\n",
" <td>191622.0</td>\n",
" <td>4.0</td>\n",
" <td>8738.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>Nissan</td>\n",
" <td>Blue</td>\n",
" <td>38186.0</td>\n",
" <td>4.0</td>\n",
" <td>23438.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>NaN</td>\n",
" <td>4.0</td>\n",
" <td>15131.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>Nissan</td>\n",
" <td>Blue</td>\n",
" <td>146430.0</td>\n",
" <td>3.0</td>\n",
" <td>9224.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>Toyota</td>\n",
" <td>Green</td>\n",
" <td>109868.0</td>\n",
" <td>4.0</td>\n",
" <td>6574.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td></td>\n",
" <td>White</td>\n",
" <td>177894.0</td>\n",
" <td>4.0</td>\n",
" <td>9229.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>189209.0</td>\n",
" <td>4.0</td>\n",
" <td>7100.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>Honda</td>\n",
" <td>Black</td>\n",
" <td>200490.0</td>\n",
" <td>4.0</td>\n",
" <td>6337.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>141617.0</td>\n",
" <td>4.0</td>\n",
" <td>19085.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>213893.0</td>\n",
" <td>4.0</td>\n",
" <td>5743.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>45</th>\n",
" <td>Nissan</td>\n",
" <td>Blue</td>\n",
" <td>231057.0</td>\n",
" <td>4.0</td>\n",
" <td>5925.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>Toyota</td>\n",
" <td>White</td>\n",
" <td>100938.0</td>\n",
" <td>4.0</td>\n",
" <td>25196.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>47</th>\n",
" <td>Toyota</td>\n",
" <td>Blue</td>\n",
" <td>243969.0</td>\n",
" <td>NaN</td>\n",
" <td>16138.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>48</th>\n",
" <td>Nissan</td>\n",
" <td>White</td>\n",
" <td>107096.0</td>\n",
" <td>3.0</td>\n",
" <td>6075.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>49</th>\n",
" <td>Toyota</td>\n",
" <td>Black</td>\n",
" <td>86333.0</td>\n",
" <td>4.0</td>\n",
" <td>9928.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Honda White 35431.0 4.0 15323.0\n",
"1 BMW Blue 192714.0 5.0 19943.0\n",
"2 Honda White 84714.0 4.0 28343.0\n",
"3 Toyota White 154365.0 4.0 13434.0\n",
"4 Nissan Blue 181577.0 3.0 14043.0\n",
"5 Honda Red 42652.0 4.0 23883.0\n",
"6 Toyota Blue 163453.0 4.0 8473.0\n",
"7 Honda White NaN 4.0 20306.0\n",
"8 White 130538.0 4.0 9374.0\n",
"9 Honda Blue 51029.0 4.0 26683.0\n",
"10 Nissan White 167421.0 4.0 16259.0\n",
"11 Nissan Green 17119.0 4.0 6160.0\n",
"12 Nissan White 102303.0 4.0 16909.0\n",
"13 White 134181.0 4.0 11121.0\n",
"14 Honda Blue 199833.0 4.0 18946.0\n",
"15 Toyota Blue 205592.0 4.0 16290.0\n",
"16 Toyota Red 96742.0 4.0 34465.0\n",
"17 BMW White 194189.0 5.0 17177.0\n",
"18 Nissan White 67991.0 3.0 9109.0\n",
"19 Nissan Blue 215820.0 4.0 6010.0\n",
"20 Toyota 124844.0 4.0 24130.0\n",
"21 Honda 30615.0 4.0 29653.0\n",
"22 Toyota White 148744.0 4.0 22489.0\n",
"23 Honda Green 130075.0 4.0 21242.0\n",
"24 Honda Blue 172718.0 4.0 14274.0\n",
"25 Honda Blue 125819.0 4.0 15686.0\n",
"26 Honda White 180390.0 4.0 13344.0\n",
"27 Honda Green 82783.0 4.0 10984.0\n",
"28 Honda White 56687.0 4.0 6135.0\n",
"29 Toyota White 112004.0 4.0 13586.0\n",
"30 Nissan Blue 34024.0 4.0 23929.0\n",
"31 Toyota White 108569.0 NaN 6866.0\n",
"32 Nissan White 203795.0 4.0 9330.0\n",
"33 Nissan Green 153554.0 3.0 9780.0\n",
"34 Honda Blue 71949.0 4.0 17516.0\n",
"35 Honda Blue 191622.0 4.0 8738.0\n",
"36 Nissan Blue 38186.0 4.0 23438.0\n",
"37 Nissan White NaN 4.0 15131.0\n",
"38 Nissan Blue 146430.0 3.0 9224.0\n",
"39 Toyota Green 109868.0 4.0 6574.0\n",
"40 White 177894.0 4.0 9229.0\n",
"41 Toyota Blue 189209.0 4.0 7100.0\n",
"42 Honda Black 200490.0 4.0 6337.0\n",
"43 Toyota Blue 141617.0 4.0 19085.0\n",
"44 Toyota Blue 213893.0 4.0 5743.0\n",
"45 Nissan Blue 231057.0 4.0 5925.0\n",
"46 Toyota White 100938.0 4.0 25196.0\n",
"47 Toyota Blue 243969.0 NaN 16138.0\n",
"48 Nissan White 107096.0 3.0 6075.0\n",
"49 Toyota Black 86333.0 4.0 9928.0"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales_ext_dropped.head(50)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Make 0\n",
"Colour 0\n",
"Odometer (KM) 50\n",
"Doors 50\n",
"Price 50\n",
"dtype: int64"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Check how many of our values are missing/NaN\n",
"sales_ext_dropped.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"# Export dataframe with random missing values\n",
"sales_ext_dropped.to_csv(\"../data/car-sales-extended-missing-data.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}