{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Creating fake data for car_sales (to make it a bit bigger)\n",
"\n",
"This notebook will manufacture data for the car_sales dataframe to make it usable to explain different techniques for missing data and converting things to numbers."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"car_sales = pd.read_csv('../data/car-sales.csv')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Make | \n",
" Colour | \n",
" Odometer (KM) | \n",
" Doors | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Toyota | \n",
" White | \n",
" 150043 | \n",
" 4 | \n",
" $4,000.00 | \n",
"
\n",
" \n",
" | 1 | \n",
" Honda | \n",
" Red | \n",
" 87899 | \n",
" 4 | \n",
" $5,000.00 | \n",
"
\n",
" \n",
" | 2 | \n",
" Toyota | \n",
" Blue | \n",
" 32549 | \n",
" 3 | \n",
" $7,000.00 | \n",
"
\n",
" \n",
" | 3 | \n",
" BMW | \n",
" Black | \n",
" 11179 | \n",
" 5 | \n",
" $22,000.00 | \n",
"
\n",
" \n",
" | 4 | \n",
" Nissan | \n",
" White | \n",
" 213095 | \n",
" 4 | \n",
" $3,500.00 | \n",
"
\n",
" \n",
" | 5 | \n",
" Toyota | \n",
" Green | \n",
" 99213 | \n",
" 4 | \n",
" $4,500.00 | \n",
"
\n",
" \n",
" | 6 | \n",
" Honda | \n",
" Blue | \n",
" 45698 | \n",
" 4 | \n",
" $7,500.00 | \n",
"
\n",
" \n",
" | 7 | \n",
" Honda | \n",
" Blue | \n",
" 54738 | \n",
" 4 | \n",
" $7,000.00 | \n",
"
\n",
" \n",
" | 8 | \n",
" Toyota | \n",
" White | \n",
" 60000 | \n",
" 4 | \n",
" $6,250.00 | \n",
"
\n",
" \n",
" | 9 | \n",
" Nissan | \n",
" White | \n",
" 31600 | \n",
" 4 | \n",
" $9,700.00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Toyota White 150043 4 $4,000.00\n",
"1 Honda Red 87899 4 $5,000.00\n",
"2 Toyota Blue 32549 3 $7,000.00\n",
"3 BMW Black 11179 5 $22,000.00\n",
"4 Nissan White 213095 4 $3,500.00\n",
"5 Toyota Green 99213 4 $4,500.00\n",
"6 Honda Blue 45698 4 $7,500.00\n",
"7 Honda Blue 54738 4 $7,000.00\n",
"8 Toyota White 60000 4 $6,250.00\n",
"9 Nissan White 31600 4 $9,700.00"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['Toyota', 'Honda', 'BMW', 'Nissan'], dtype=object)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales.Make.unique()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Toyota 4\n",
"Honda 3\n",
"Nissan 2\n",
"BMW 1\n",
"Name: Make, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales.Make.value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create fake \"Make\" data"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(393,\n",
" ['Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota',\n",
" 'Toyota'])"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create fake \"Make\" data\n",
"\n",
"toyota = [\"Toyota\" for i in range(0, 393)]\n",
"len(toyota), toyota[:10]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"(304,\n",
" ['Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda',\n",
" 'Honda'])"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"honda = [\"Honda\" for i in range(0, 304)]\n",
"len(honda), honda[:10]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(198,\n",
" ['Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan',\n",
" 'Nissan'])"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nissan = [\"Nissan\" for i in range(0, 198)]\n",
"len(nissan), nissan[:10]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(100, ['BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW', 'BMW'])"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"bmw = [\"BMW\" for i in range(0, 100)]\n",
"len(bmw), bmw[:10]"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1000"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"makes = bmw+nissan+toyota+honda\n",
"len(makes)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create fake \"Colour\" data"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array(['White', 'Red', 'Blue', 'Black', 'Green'], dtype=object)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales.Colour.unique()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"White 4\n",
"Blue 3\n",
"Green 1\n",
"Black 1\n",
"Red 1\n",
"Name: Colour, dtype: int64"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales.Colour.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(407, ['White', 'White', 'White'])"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"white = [\"White\" for i in range(0, 407)]\n",
"len(white), white[:3]"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(321, ['Blue', 'Blue', 'Blue'])"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"blue = [\"Blue\" for i in range(0, 321)]\n",
"len(blue), blue[:3]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(79, ['Green', 'Green', 'Green'])"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"green = [\"Green\" for i in range(0, 79)]\n",
"len(green), green[:3]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(99, ['Black', 'Black', 'Black'])"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"black = [\"Black\" for i in range(0, 99)]\n",
"len(black), black[:3]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(94, ['Red', 'Red', 'Red'])"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"red = [\"Red\" for i in range(0, 94)]\n",
"len(red), red[:3]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1000"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"colours = white+blue+green+black+red\n",
"len(colours)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1000,\n",
" ['White',\n",
" 'White',\n",
" 'Blue',\n",
" 'Blue',\n",
" 'Blue',\n",
" 'White',\n",
" 'Blue',\n",
" 'Blue',\n",
" 'Red',\n",
" 'White'])"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import random\n",
"colours_shuffled = random.sample(colours, len(colours))\n",
"len(colours_shuffled), colours_shuffled[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create fake Odometer (KM) data"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Make | \n",
" Colour | \n",
" Odometer (KM) | \n",
" Doors | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Toyota | \n",
" White | \n",
" 150043 | \n",
" 4 | \n",
" $4,000.00 | \n",
"
\n",
" \n",
" | 1 | \n",
" Honda | \n",
" Red | \n",
" 87899 | \n",
" 4 | \n",
" $5,000.00 | \n",
"
\n",
" \n",
" | 2 | \n",
" Toyota | \n",
" Blue | \n",
" 32549 | \n",
" 3 | \n",
" $7,000.00 | \n",
"
\n",
" \n",
" | 3 | \n",
" BMW | \n",
" Black | \n",
" 11179 | \n",
" 5 | \n",
" $22,000.00 | \n",
"
\n",
" \n",
" | 4 | \n",
" Nissan | \n",
" White | \n",
" 213095 | \n",
" 4 | \n",
" $3,500.00 | \n",
"
\n",
" \n",
" | 5 | \n",
" Toyota | \n",
" Green | \n",
" 99213 | \n",
" 4 | \n",
" $4,500.00 | \n",
"
\n",
" \n",
" | 6 | \n",
" Honda | \n",
" Blue | \n",
" 45698 | \n",
" 4 | \n",
" $7,500.00 | \n",
"
\n",
" \n",
" | 7 | \n",
" Honda | \n",
" Blue | \n",
" 54738 | \n",
" 4 | \n",
" $7,000.00 | \n",
"
\n",
" \n",
" | 8 | \n",
" Toyota | \n",
" White | \n",
" 60000 | \n",
" 4 | \n",
" $6,250.00 | \n",
"
\n",
" \n",
" | 9 | \n",
" Nissan | \n",
" White | \n",
" 31600 | \n",
" 4 | \n",
" $9,700.00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Toyota White 150043 4 $4,000.00\n",
"1 Honda Red 87899 4 $5,000.00\n",
"2 Toyota Blue 32549 3 $7,000.00\n",
"3 BMW Black 11179 5 $22,000.00\n",
"4 Nissan White 213095 4 $3,500.00\n",
"5 Toyota Green 99213 4 $4,500.00\n",
"6 Honda Blue 45698 4 $7,500.00\n",
"7 Honda Blue 54738 4 $7,000.00\n",
"8 Toyota White 60000 4 $6,250.00\n",
"9 Nissan White 31600 4 $9,700.00"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1000,\n",
" [195419,\n",
" 69066,\n",
" 209466,\n",
" 79301,\n",
" 134103,\n",
" 143651,\n",
" 245427,\n",
" 244095,\n",
" 176660,\n",
" 194189])"
]
},
"execution_count": 64,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"odometer = [random.randint(9789, 250000) for i in range(0, 1000)]\n",
"len(odometer), odometer[:10]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create fake \"Doors\" data"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"five_doors = [5 for i in range(0, 79)]\n",
"three_doors = [3 for i in range(0, 65)]\n",
"four_doors = [4 for i in range(0, 856)]\n",
"doors = five_doors + three_doors + four_doors\n",
"doors_shuffled = random.sample(doors, len(doors))"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 3,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 3,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 5,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 5,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 3,\n",
" 4,\n",
" 4,\n",
" 4,\n",
" 4]"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"doors_shuffled"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create fake \"Price\" data"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Toyota 398\n",
"Honda 304\n",
"Nissan 198\n",
"BMW 100\n",
"dtype: int64"
]
},
"execution_count": 68,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"makes_series = pd.Series(makes)\n",
"makes_series.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Make | \n",
" Colour | \n",
" Odometer (KM) | \n",
" Doors | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Toyota | \n",
" White | \n",
" 150043 | \n",
" 4 | \n",
" $4,000.00 | \n",
"
\n",
" \n",
" | 1 | \n",
" Honda | \n",
" Red | \n",
" 87899 | \n",
" 4 | \n",
" $5,000.00 | \n",
"
\n",
" \n",
" | 2 | \n",
" Toyota | \n",
" Blue | \n",
" 32549 | \n",
" 3 | \n",
" $7,000.00 | \n",
"
\n",
" \n",
" | 3 | \n",
" BMW | \n",
" Black | \n",
" 11179 | \n",
" 5 | \n",
" $22,000.00 | \n",
"
\n",
" \n",
" | 4 | \n",
" Nissan | \n",
" White | \n",
" 213095 | \n",
" 4 | \n",
" $3,500.00 | \n",
"
\n",
" \n",
" | 5 | \n",
" Toyota | \n",
" Green | \n",
" 99213 | \n",
" 4 | \n",
" $4,500.00 | \n",
"
\n",
" \n",
" | 6 | \n",
" Honda | \n",
" Blue | \n",
" 45698 | \n",
" 4 | \n",
" $7,500.00 | \n",
"
\n",
" \n",
" | 7 | \n",
" Honda | \n",
" Blue | \n",
" 54738 | \n",
" 4 | \n",
" $7,000.00 | \n",
"
\n",
" \n",
" | 8 | \n",
" Toyota | \n",
" White | \n",
" 60000 | \n",
" 4 | \n",
" $6,250.00 | \n",
"
\n",
" \n",
" | 9 | \n",
" Nissan | \n",
" White | \n",
" 31600 | \n",
" 4 | \n",
" $9,700.00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Toyota White 150043 4 $4,000.00\n",
"1 Honda Red 87899 4 $5,000.00\n",
"2 Toyota Blue 32549 3 $7,000.00\n",
"3 BMW Black 11179 5 $22,000.00\n",
"4 Nissan White 213095 4 $3,500.00\n",
"5 Toyota Green 99213 4 $4,500.00\n",
"6 Honda Blue 45698 4 $7,500.00\n",
"7 Honda Blue 54738 4 $7,000.00\n",
"8 Toyota White 60000 4 $6,250.00\n",
"9 Nissan White 31600 4 $9,700.00"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Make | \n",
" Colour | \n",
" Odometer (KM) | \n",
" Doors | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Toyota | \n",
" White | \n",
" 150043 | \n",
" 4 | \n",
" $4,000.00 | \n",
"
\n",
" \n",
" | 2 | \n",
" Toyota | \n",
" Blue | \n",
" 32549 | \n",
" 3 | \n",
" $7,000.00 | \n",
"
\n",
" \n",
" | 5 | \n",
" Toyota | \n",
" Green | \n",
" 99213 | \n",
" 4 | \n",
" $4,500.00 | \n",
"
\n",
" \n",
" | 8 | \n",
" Toyota | \n",
" White | \n",
" 60000 | \n",
" 4 | \n",
" $6,250.00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Toyota White 150043 4 $4,000.00\n",
"2 Toyota Blue 32549 3 $7,000.00\n",
"5 Toyota Green 99213 4 $4,500.00\n",
"8 Toyota White 60000 4 $6,250.00"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales[car_sales[\"Make\"] == \"Toyota\"]"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Make | \n",
" Colour | \n",
" Odometer (KM) | \n",
" Doors | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" | 1 | \n",
" Honda | \n",
" Red | \n",
" 87899 | \n",
" 4 | \n",
" $5,000.00 | \n",
"
\n",
" \n",
" | 6 | \n",
" Honda | \n",
" Blue | \n",
" 45698 | \n",
" 4 | \n",
" $7,500.00 | \n",
"
\n",
" \n",
" | 7 | \n",
" Honda | \n",
" Blue | \n",
" 54738 | \n",
" 4 | \n",
" $7,000.00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"1 Honda Red 87899 4 $5,000.00\n",
"6 Honda Blue 45698 4 $7,500.00\n",
"7 Honda Blue 54738 4 $7,000.00"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales[car_sales[\"Make\"] == \"Honda\"]"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Make | \n",
" Colour | \n",
" Odometer (KM) | \n",
" Doors | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" | 4 | \n",
" Nissan | \n",
" White | \n",
" 213095 | \n",
" 4 | \n",
" $3,500.00 | \n",
"
\n",
" \n",
" | 9 | \n",
" Nissan | \n",
" White | \n",
" 31600 | \n",
" 4 | \n",
" $9,700.00 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"4 Nissan White 213095 4 $3,500.00\n",
"9 Nissan White 31600 4 $9,700.00"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"car_sales[car_sales[\"Make\"] == \"Nissan\"]"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1000,\n",
" [27185,\n",
" 8815,\n",
" 23614,\n",
" 19783,\n",
" 29208,\n",
" 17251,\n",
" 10138,\n",
" 27640,\n",
" 7332,\n",
" 9946,\n",
" 29670,\n",
" 12779,\n",
" 26735,\n",
" 21481,\n",
" 9313,\n",
" 13094,\n",
" 17684,\n",
" 21389,\n",
" 5239,\n",
" 16733,\n",
" 19670,\n",
" 10542,\n",
" 11122,\n",
" 21311,\n",
" 29545,\n",
" 20601,\n",
" 22714,\n",
" 28876,\n",
" 14063,\n",
" 5491])"
]
},
"execution_count": 119,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"prices = [random.randint(5000, 30000) for i in range(0, 1000)]\n",
"len(prices), prices[:30]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Create base dataframe with manufactured data"
]
},
{
"cell_type": "code",
"execution_count": 136,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Make | \n",
" Colour | \n",
" Odometer (KM) | \n",
" Doors | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [Make, Colour, Odometer (KM), Doors, Price]\n",
"Index: []"
]
},
"execution_count": 136,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fake_sales = pd.DataFrame(columns = [\"Make\", \"Colour\", \"Odometer (KM)\", \"Doors\", \"Price\"])\n",
"fake_sales"
]
},
{
"cell_type": "code",
"execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
"fake_sales[\"Make\"] = makes\n",
"fake_sales[\"Colour\"] = colours_shuffled\n",
"fake_sales[\"Odometer (KM)\"] = odometer\n",
"fake_sales[\"Doors\"] = doors\n",
"fake_sales[\"Price\"] = prices"
]
},
{
"cell_type": "code",
"execution_count": 138,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Make | \n",
" Colour | \n",
" Odometer (KM) | \n",
" Doors | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" BMW | \n",
" White | \n",
" 195419 | \n",
" 5 | \n",
" 27185 | \n",
"
\n",
" \n",
" | 1 | \n",
" BMW | \n",
" White | \n",
" 69066 | \n",
" 5 | \n",
" 8815 | \n",
"
\n",
" \n",
" | 2 | \n",
" BMW | \n",
" Blue | \n",
" 209466 | \n",
" 5 | \n",
" 23614 | \n",
"
\n",
" \n",
" | 3 | \n",
" BMW | \n",
" Blue | \n",
" 79301 | \n",
" 5 | \n",
" 19783 | \n",
"
\n",
" \n",
" | 4 | \n",
" BMW | \n",
" Blue | \n",
" 134103 | \n",
" 5 | \n",
" 29208 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 BMW White 195419 5 27185\n",
"1 BMW White 69066 5 8815\n",
"2 BMW Blue 209466 5 23614\n",
"3 BMW Blue 79301 5 19783\n",
"4 BMW Blue 134103 5 29208"
]
},
"execution_count": 138,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fake_sales.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Adjust the price column\n",
"\n",
"For the price column:\n",
"* Generate random numbers between the certain values\n",
"* If the Odometer reading is above 100K, multiply price by 0.75\n",
"* If the Odometer reading is above 150K, multiply price by 0.6\n",
"* If the Odometer reading is above 200K, multiply price by 0.5\n",
"* If the Make column is BMW, multiply price by 1.5 + 2500\n",
"* If the Make column is Toyota, multuply price by 1.2\n",
"* If the Make is Nissan, multiply price by 1.1\n",
"* If the Make is Honda, add $1000 to price"
]
},
{
"cell_type": "code",
"execution_count": 139,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 1000.000000\n",
"mean 17369.943000\n",
"std 7260.398755\n",
"min 5005.000000\n",
"25% 11039.500000\n",
"50% 17427.500000\n",
"75% 23353.500000\n",
"max 29990.000000\n",
"Name: Price, dtype: float64"
]
},
"execution_count": 139,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"fake_sales[\"Price\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 140,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 1000.000000\n",
"mean 13151.713000\n",
"std 6722.177036\n",
"min 2509.000000\n",
"25% 7854.750000\n",
"50% 12016.000000\n",
"75% 17082.250000\n",
"max 29990.000000\n",
"Name: Price, dtype: float64"
]
},
"execution_count": 140,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def price_od(price, odometer):\n",
" \"\"\"\n",
" Changes price according to Odometer values.\n",
" \"\"\"\n",
" if 100000 <= odometer <= 150000:\n",
" return round(price * 0.75)\n",
" elif 150001 <= odometer <= 200000:\n",
" return round(price * 0.6)\n",
" elif 200001 <= odometer:\n",
" return round(price * 0.5)\n",
" else:\n",
" return price\n",
"\n",
"fake_sales[\"Price\"] = fake_sales.apply(lambda x: price_od(x[\"Price\"], \n",
" x[\"Odometer (KM)\"]), \n",
" axis=1)\n",
"\n",
"fake_sales[\"Price\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 141,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 1000.000000\n",
"mean 16045.665000\n",
"std 8630.794219\n",
"min 2796.000000\n",
"25% 9481.500000\n",
"50% 14264.000000\n",
"75% 20738.750000\n",
"max 52458.000000\n",
"Name: Price, dtype: float64"
]
},
"execution_count": 141,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"def price_make(price, make):\n",
" \"\"\"\n",
" Manipulates the price base on the cars make.\n",
" \"\"\"\n",
" if make == \"BMW\":\n",
" return round((price * 1.5) + random.randint(3000, 10000))\n",
" elif make == \"Toyota\":\n",
" return round(price * 1.2)\n",
" elif make == \"Nissan\":\n",
" return round(price * 1.1)\n",
" elif make == \"Honda\":\n",
" return round(price + 1000)\n",
" else:\n",
" return price\n",
"\n",
"fake_sales[\"Price\"] = fake_sales.apply(lambda x: price_make(x[\"Price\"], \n",
" x[\"Make\"]), \n",
" axis=1)\n",
"\n",
"fake_sales[\"Price\"].describe()"
]
},
{
"cell_type": "code",
"execution_count": 142,
"metadata": {},
"outputs": [],
"source": [
"fake_sales = fake_sales.sample(frac=1)"
]
},
{
"cell_type": "code",
"execution_count": 143,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Make | \n",
" Colour | \n",
" Odometer (KM) | \n",
" Doors | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Honda | \n",
" White | \n",
" 35431 | \n",
" 4 | \n",
" 15323 | \n",
"
\n",
" \n",
" | 1 | \n",
" BMW | \n",
" Blue | \n",
" 192714 | \n",
" 5 | \n",
" 19943 | \n",
"
\n",
" \n",
" | 2 | \n",
" Honda | \n",
" White | \n",
" 84714 | \n",
" 4 | \n",
" 28343 | \n",
"
\n",
" \n",
" | 3 | \n",
" Toyota | \n",
" White | \n",
" 154365 | \n",
" 4 | \n",
" 13434 | \n",
"
\n",
" \n",
" | 4 | \n",
" Nissan | \n",
" Blue | \n",
" 181577 | \n",
" 3 | \n",
" 14043 | \n",
"
\n",
" \n",
" | 5 | \n",
" Honda | \n",
" Red | \n",
" 42652 | \n",
" 4 | \n",
" 23883 | \n",
"
\n",
" \n",
" | 6 | \n",
" Toyota | \n",
" Blue | \n",
" 163453 | \n",
" 4 | \n",
" 8473 | \n",
"
\n",
" \n",
" | 7 | \n",
" Honda | \n",
" White | \n",
" 43120 | \n",
" 4 | \n",
" 20306 | \n",
"
\n",
" \n",
" | 8 | \n",
" Nissan | \n",
" White | \n",
" 130538 | \n",
" 4 | \n",
" 9374 | \n",
"
\n",
" \n",
" | 9 | \n",
" Honda | \n",
" Blue | \n",
" 51029 | \n",
" 4 | \n",
" 26683 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Honda White 35431 4 15323\n",
"1 BMW Blue 192714 5 19943\n",
"2 Honda White 84714 4 28343\n",
"3 Toyota White 154365 4 13434\n",
"4 Nissan Blue 181577 3 14043\n",
"5 Honda Red 42652 4 23883\n",
"6 Toyota Blue 163453 4 8473\n",
"7 Honda White 43120 4 20306\n",
"8 Nissan White 130538 4 9374\n",
"9 Honda Blue 51029 4 26683"
]
},
"execution_count": 143,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Renumber the shuffled rows from 0 and discard the old index\n",
"fake_sales = fake_sales.reset_index(drop=True)\n",
"fake_sales.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NEXT:\n",
"* Drop some values at random (to manufacture missing data)\n",
"* Build a random forest model to predict (this will involve changing categories to numerical data)"
]
},
{
"cell_type": "code",
"execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
"# Export the data (index=False so the row index isn't written as an extra unnamed column)\n",
"fake_sales.to_csv(\"../data/car-sales-extended.csv\", index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Make missing data in car_sales_extended"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"sales_ext = pd.read_csv(\"../data/car-sales-extended.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1000"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(sales_ext)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Make | \n",
" Colour | \n",
" Odometer (KM) | \n",
" Doors | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Honda | \n",
" White | \n",
" 35431 | \n",
" 4 | \n",
" 15323 | \n",
"
\n",
" \n",
" | 1 | \n",
" BMW | \n",
" Blue | \n",
" 192714 | \n",
" 5 | \n",
" 19943 | \n",
"
\n",
" \n",
" | 2 | \n",
" Honda | \n",
" White | \n",
" 84714 | \n",
" 4 | \n",
" 28343 | \n",
"
\n",
" \n",
" | 3 | \n",
" Toyota | \n",
" White | \n",
" 154365 | \n",
" 4 | \n",
" 13434 | \n",
"
\n",
" \n",
" | 4 | \n",
" Nissan | \n",
" Blue | \n",
" 181577 | \n",
" 3 | \n",
" 14043 | \n",
"
\n",
" \n",
" | ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" | 995 | \n",
" Toyota | \n",
" Black | \n",
" 35820 | \n",
" 4 | \n",
" 32042 | \n",
"
\n",
" \n",
" | 996 | \n",
" Nissan | \n",
" White | \n",
" 155144 | \n",
" 3 | \n",
" 5716 | \n",
"
\n",
" \n",
" | 997 | \n",
" Nissan | \n",
" Blue | \n",
" 66604 | \n",
" 4 | \n",
" 31570 | \n",
"
\n",
" \n",
" | 998 | \n",
" Honda | \n",
" White | \n",
" 215883 | \n",
" 4 | \n",
" 4001 | \n",
"
\n",
" \n",
" | 999 | \n",
" Toyota | \n",
" Blue | \n",
" 248360 | \n",
" 4 | \n",
" 12732 | \n",
"
\n",
" \n",
"
\n",
"
1000 rows × 5 columns
\n",
"
"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Honda White 35431 4 15323\n",
"1 BMW Blue 192714 5 19943\n",
"2 Honda White 84714 4 28343\n",
"3 Toyota White 154365 4 13434\n",
"4 Nissan Blue 181577 3 14043\n",
".. ... ... ... ... ...\n",
"995 Toyota Black 35820 4 32042\n",
"996 Nissan White 155144 3 5716\n",
"997 Nissan Blue 66604 4 31570\n",
"998 Honda White 215883 4 4001\n",
"999 Toyota Blue 248360 4 12732\n",
"\n",
"[1000 rows x 5 columns]"
]
},
"execution_count": 63,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales_ext"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### What we want to do\n",
"* Remove some row values or replace them at random\n",
" * E.g. replace strings with empty strings (\"\")\n",
" * And numbers with NaN or something similar...\n",
"* Want to keep the number of samples the same, order the same, just put some holes in it\n",
"\n",
"One way to do it would be to generate 50 random integers for each column and then drop/replace the values at those indices."
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# Replicate the df (a real copy, so punching holes below leaves sales_ext intact)\n",
"sales_ext_dropped = sales_ext.copy()"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
"# Make column\n",
"# The upper bound comes from the frame itself: a .loc assignment on a label\n",
"# that doesn't exist would append a new row instead of blanking an existing one\n",
"np.random.seed(10)\n",
"make_idx = np.random.randint(0, len(sales_ext_dropped), 50)"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([265, 125, 996, 527, 320, 369, 123, 156, 985, 733, 496, 925, 881,\n",
" 8, 73, 256, 490, 40, 502, 420, 371, 528, 356, 239, 395, 54,\n",
" 344, 363, 122, 574, 545, 200, 868, 974, 689, 691, 54, 77, 453,\n",
" 13, 755, 409, 382, 653, 860, 342, 798, 670, 89, 652])"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"make_idx"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
"# Blank out \"Make\" on every sampled row in a single assignment\n",
"sales_ext_dropped.loc[make_idx, \"Make\"] = \"\""
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Honda'"
]
},
"execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales_ext_dropped.loc[266, \"Make\"]"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# Colour column\n",
"# Bound the draw by the frame length so every index is an existing row label\n",
"np.random.seed(42)\n",
"colour_idx = np.random.randint(0, len(sales_ext_dropped), 50)\n",
"sales_ext_dropped.loc[colour_idx, \"Colour\"] = \"\""
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"# Odometer (KM) column\n",
"# Bound the draw by the frame length so every index is an existing row label\n",
"np.random.seed(1)\n",
"odom_idx = np.random.randint(0, len(sales_ext_dropped), 50)\n",
"sales_ext_dropped.loc[odom_idx, \"Odometer (KM)\"] = None"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"# Doors column\n",
"# Bound the draw by the frame length so every index is an existing row label\n",
"np.random.seed(2)\n",
"door_idx = np.random.randint(0, len(sales_ext_dropped), 50)\n",
"sales_ext_dropped.loc[door_idx, \"Doors\"] = None"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"# Price column\n",
"# Bound the draw by the frame length so every index is an existing row label\n",
"np.random.seed(3)\n",
"price_idx = np.random.randint(0, len(sales_ext_dropped), 50)\n",
"sales_ext_dropped.loc[price_idx, \"Price\"] = None"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Make | \n",
" Colour | \n",
" Odometer (KM) | \n",
" Doors | \n",
" Price | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" Honda | \n",
" White | \n",
" 35431.0 | \n",
" 4.0 | \n",
" 15323.0 | \n",
"
\n",
" \n",
" | 1 | \n",
" BMW | \n",
" Blue | \n",
" 192714.0 | \n",
" 5.0 | \n",
" 19943.0 | \n",
"
\n",
" \n",
" | 2 | \n",
" Honda | \n",
" White | \n",
" 84714.0 | \n",
" 4.0 | \n",
" 28343.0 | \n",
"
\n",
" \n",
" | 3 | \n",
" Toyota | \n",
" White | \n",
" 154365.0 | \n",
" 4.0 | \n",
" 13434.0 | \n",
"
\n",
" \n",
" | 4 | \n",
" Nissan | \n",
" Blue | \n",
" 181577.0 | \n",
" 3.0 | \n",
" 14043.0 | \n",
"
\n",
" \n",
" | 5 | \n",
" Honda | \n",
" Red | \n",
" 42652.0 | \n",
" 4.0 | \n",
" 23883.0 | \n",
"
\n",
" \n",
" | 6 | \n",
" Toyota | \n",
" Blue | \n",
" 163453.0 | \n",
" 4.0 | \n",
" 8473.0 | \n",
"
\n",
" \n",
" | 7 | \n",
" Honda | \n",
" White | \n",
" NaN | \n",
" 4.0 | \n",
" 20306.0 | \n",
"
\n",
" \n",
" | 8 | \n",
" | \n",
" White | \n",
" 130538.0 | \n",
" 4.0 | \n",
" 9374.0 | \n",
"
\n",
" \n",
" | 9 | \n",
" Honda | \n",
" Blue | \n",
" 51029.0 | \n",
" 4.0 | \n",
" 26683.0 | \n",
"
\n",
" \n",
" | 10 | \n",
" Nissan | \n",
" White | \n",
" 167421.0 | \n",
" 4.0 | \n",
" 16259.0 | \n",
"
\n",
" \n",
" | 11 | \n",
" Nissan | \n",
" Green | \n",
" 17119.0 | \n",
" 4.0 | \n",
" 6160.0 | \n",
"
\n",
" \n",
" | 12 | \n",
" Nissan | \n",
" White | \n",
" 102303.0 | \n",
" 4.0 | \n",
" 16909.0 | \n",
"
\n",
" \n",
" | 13 | \n",
" | \n",
" White | \n",
" 134181.0 | \n",
" 4.0 | \n",
" 11121.0 | \n",
"
\n",
" \n",
" | 14 | \n",
" Honda | \n",
" Blue | \n",
" 199833.0 | \n",
" 4.0 | \n",
" 18946.0 | \n",
"
\n",
" \n",
" | 15 | \n",
" Toyota | \n",
" Blue | \n",
" 205592.0 | \n",
" 4.0 | \n",
" 16290.0 | \n",
"
\n",
" \n",
" | 16 | \n",
" Toyota | \n",
" Red | \n",
" 96742.0 | \n",
" 4.0 | \n",
" 34465.0 | \n",
"
\n",
" \n",
" | 17 | \n",
" BMW | \n",
" White | \n",
" 194189.0 | \n",
" 5.0 | \n",
" 17177.0 | \n",
"
\n",
" \n",
" | 18 | \n",
" Nissan | \n",
" White | \n",
" 67991.0 | \n",
" 3.0 | \n",
" 9109.0 | \n",
"
\n",
" \n",
" | 19 | \n",
" Nissan | \n",
" Blue | \n",
" 215820.0 | \n",
" 4.0 | \n",
" 6010.0 | \n",
"
\n",
" \n",
" | 20 | \n",
" Toyota | \n",
" | \n",
" 124844.0 | \n",
" 4.0 | \n",
" 24130.0 | \n",
"
\n",
" \n",
" | 21 | \n",
" Honda | \n",
" | \n",
" 30615.0 | \n",
" 4.0 | \n",
" 29653.0 | \n",
"
\n",
" \n",
" | 22 | \n",
" Toyota | \n",
" White | \n",
" 148744.0 | \n",
" 4.0 | \n",
" 22489.0 | \n",
"
\n",
" \n",
" | 23 | \n",
" Honda | \n",
" Green | \n",
" 130075.0 | \n",
" 4.0 | \n",
" 21242.0 | \n",
"
\n",
" \n",
" | 24 | \n",
" Honda | \n",
" Blue | \n",
" 172718.0 | \n",
" 4.0 | \n",
" 14274.0 | \n",
"
\n",
" \n",
" | 25 | \n",
" Honda | \n",
" Blue | \n",
" 125819.0 | \n",
" 4.0 | \n",
" 15686.0 | \n",
"
\n",
" \n",
" | 26 | \n",
" Honda | \n",
" White | \n",
" 180390.0 | \n",
" 4.0 | \n",
" 13344.0 | \n",
"
\n",
" \n",
" | 27 | \n",
" Honda | \n",
" Green | \n",
" 82783.0 | \n",
" 4.0 | \n",
" 10984.0 | \n",
"
\n",
" \n",
" | 28 | \n",
" Honda | \n",
" White | \n",
" 56687.0 | \n",
" 4.0 | \n",
" 6135.0 | \n",
"
\n",
" \n",
" | 29 | \n",
" Toyota | \n",
" White | \n",
" 112004.0 | \n",
" 4.0 | \n",
" 13586.0 | \n",
"
\n",
" \n",
" | 30 | \n",
" Nissan | \n",
" Blue | \n",
" 34024.0 | \n",
" 4.0 | \n",
" 23929.0 | \n",
"
\n",
" \n",
" | 31 | \n",
" Toyota | \n",
" White | \n",
" 108569.0 | \n",
" NaN | \n",
" 6866.0 | \n",
"
\n",
" \n",
" | 32 | \n",
" Nissan | \n",
" White | \n",
" 203795.0 | \n",
" 4.0 | \n",
" 9330.0 | \n",
"
\n",
" \n",
" | 33 | \n",
" Nissan | \n",
" Green | \n",
" 153554.0 | \n",
" 3.0 | \n",
" 9780.0 | \n",
"
\n",
" \n",
" | 34 | \n",
" Honda | \n",
" Blue | \n",
" 71949.0 | \n",
" 4.0 | \n",
" 17516.0 | \n",
"
\n",
" \n",
" | 35 | \n",
" Honda | \n",
" Blue | \n",
" 191622.0 | \n",
" 4.0 | \n",
" 8738.0 | \n",
"
\n",
" \n",
" | 36 | \n",
" Nissan | \n",
" Blue | \n",
" 38186.0 | \n",
" 4.0 | \n",
" 23438.0 | \n",
"
\n",
" \n",
" | 37 | \n",
" Nissan | \n",
" White | \n",
" NaN | \n",
" 4.0 | \n",
" 15131.0 | \n",
"
\n",
" \n",
" | 38 | \n",
" Nissan | \n",
" Blue | \n",
" 146430.0 | \n",
" 3.0 | \n",
" 9224.0 | \n",
"
\n",
" \n",
" | 39 | \n",
" Toyota | \n",
" Green | \n",
" 109868.0 | \n",
" 4.0 | \n",
" 6574.0 | \n",
"
\n",
" \n",
" | 40 | \n",
" | \n",
" White | \n",
" 177894.0 | \n",
" 4.0 | \n",
" 9229.0 | \n",
"
\n",
" \n",
" | 41 | \n",
" Toyota | \n",
" Blue | \n",
" 189209.0 | \n",
" 4.0 | \n",
" 7100.0 | \n",
"
\n",
" \n",
" | 42 | \n",
" Honda | \n",
" Black | \n",
" 200490.0 | \n",
" 4.0 | \n",
" 6337.0 | \n",
"
\n",
" \n",
" | 43 | \n",
" Toyota | \n",
" Blue | \n",
" 141617.0 | \n",
" 4.0 | \n",
" 19085.0 | \n",
"
\n",
" \n",
" | 44 | \n",
" Toyota | \n",
" Blue | \n",
" 213893.0 | \n",
" 4.0 | \n",
" 5743.0 | \n",
"
\n",
" \n",
" | 45 | \n",
" Nissan | \n",
" Blue | \n",
" 231057.0 | \n",
" 4.0 | \n",
" 5925.0 | \n",
"
\n",
" \n",
" | 46 | \n",
" Toyota | \n",
" White | \n",
" 100938.0 | \n",
" 4.0 | \n",
" 25196.0 | \n",
"
\n",
" \n",
" | 47 | \n",
" Toyota | \n",
" Blue | \n",
" 243969.0 | \n",
" NaN | \n",
" 16138.0 | \n",
"
\n",
" \n",
" | 48 | \n",
" Nissan | \n",
" White | \n",
" 107096.0 | \n",
" 3.0 | \n",
" 6075.0 | \n",
"
\n",
" \n",
" | 49 | \n",
" Toyota | \n",
" Black | \n",
" 86333.0 | \n",
" 4.0 | \n",
" 9928.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Make Colour Odometer (KM) Doors Price\n",
"0 Honda White 35431.0 4.0 15323.0\n",
"1 BMW Blue 192714.0 5.0 19943.0\n",
"2 Honda White 84714.0 4.0 28343.0\n",
"3 Toyota White 154365.0 4.0 13434.0\n",
"4 Nissan Blue 181577.0 3.0 14043.0\n",
"5 Honda Red 42652.0 4.0 23883.0\n",
"6 Toyota Blue 163453.0 4.0 8473.0\n",
"7 Honda White NaN 4.0 20306.0\n",
"8 White 130538.0 4.0 9374.0\n",
"9 Honda Blue 51029.0 4.0 26683.0\n",
"10 Nissan White 167421.0 4.0 16259.0\n",
"11 Nissan Green 17119.0 4.0 6160.0\n",
"12 Nissan White 102303.0 4.0 16909.0\n",
"13 White 134181.0 4.0 11121.0\n",
"14 Honda Blue 199833.0 4.0 18946.0\n",
"15 Toyota Blue 205592.0 4.0 16290.0\n",
"16 Toyota Red 96742.0 4.0 34465.0\n",
"17 BMW White 194189.0 5.0 17177.0\n",
"18 Nissan White 67991.0 3.0 9109.0\n",
"19 Nissan Blue 215820.0 4.0 6010.0\n",
"20 Toyota 124844.0 4.0 24130.0\n",
"21 Honda 30615.0 4.0 29653.0\n",
"22 Toyota White 148744.0 4.0 22489.0\n",
"23 Honda Green 130075.0 4.0 21242.0\n",
"24 Honda Blue 172718.0 4.0 14274.0\n",
"25 Honda Blue 125819.0 4.0 15686.0\n",
"26 Honda White 180390.0 4.0 13344.0\n",
"27 Honda Green 82783.0 4.0 10984.0\n",
"28 Honda White 56687.0 4.0 6135.0\n",
"29 Toyota White 112004.0 4.0 13586.0\n",
"30 Nissan Blue 34024.0 4.0 23929.0\n",
"31 Toyota White 108569.0 NaN 6866.0\n",
"32 Nissan White 203795.0 4.0 9330.0\n",
"33 Nissan Green 153554.0 3.0 9780.0\n",
"34 Honda Blue 71949.0 4.0 17516.0\n",
"35 Honda Blue 191622.0 4.0 8738.0\n",
"36 Nissan Blue 38186.0 4.0 23438.0\n",
"37 Nissan White NaN 4.0 15131.0\n",
"38 Nissan Blue 146430.0 3.0 9224.0\n",
"39 Toyota Green 109868.0 4.0 6574.0\n",
"40 White 177894.0 4.0 9229.0\n",
"41 Toyota Blue 189209.0 4.0 7100.0\n",
"42 Honda Black 200490.0 4.0 6337.0\n",
"43 Toyota Blue 141617.0 4.0 19085.0\n",
"44 Toyota Blue 213893.0 4.0 5743.0\n",
"45 Nissan Blue 231057.0 4.0 5925.0\n",
"46 Toyota White 100938.0 4.0 25196.0\n",
"47 Toyota Blue 243969.0 NaN 16138.0\n",
"48 Nissan White 107096.0 3.0 6075.0\n",
"49 Toyota Black 86333.0 4.0 9928.0"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sales_ext_dropped.head(50)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Make 0\n",
"Colour 0\n",
"Odometer (KM) 50\n",
"Doors 50\n",
"Price 50\n",
"dtype: int64"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count NaN entries per column (the blank-string holes in Make/Colour are not NaN, so they count as 0)\n",
"sales_ext_dropped.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"# Export dataframe with random missing values\n",
"sales_ext_dropped.to_csv(\"../data/car-sales-extended-missing-data.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}