create frequency table

This commit is contained in:
Anton Wirsing 2025-08-21 17:01:44 +02:00
parent 1455466044
commit c0a083ae92

255
dataMining.ipynb Normal file
View File

@ -0,0 +1,255 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 156,
"id": "83c66bad-e9bb-4ea7-9f6b-be842df821f0",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import json\n",
"import re\n",
"import unicodedata\n",
"import numpy as np\n",
"# Load the semicolon-separated quotation export; malformed lines are skipped\n",
"# instead of raising.\n",
"data = pd.read_csv(\n",
"    \"./quotations.csv\",\n",
"    sep=\";\",\n",
"    on_bad_lines=\"skip\",\n",
")\n",
"\n",
"# Keep only rows that carry both a textual requirement and a price.\n",
"# NOTE: no product filter is applied here, so every cleaning type remains.\n",
"data = data.dropna(subset=[\"requirements_textual\", \"price\"])\n",
"\n",
"# Candidate extra condition, currently disabled (it refers to a `booking`\n",
"# frame that this cell does not define):\n",
"# (booking['product_name'] == \"Fensterreinigung\")\n",
"# & booking['quotation_info_request'].isna()\n"
]
},
{
"cell_type": "code",
"execution_count": 170,
"id": "d78b8f4a-f140-463f-8369-163f520dca4b",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/tmp/ipykernel_3060060/2985588999.py:27: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" shortened.loc[len(shortened)] = [othersFreq,othersRelFreq,\"Sonstige\",othersCumRelFreq]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>freq</th>\n",
" <th>relFreq</th>\n",
" <th>name</th>\n",
" <th>cumRelFreq</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5169</td>\n",
" <td>0.41</td>\n",
" <td>Umzugsreinigung</td>\n",
" <td>0.41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2448</td>\n",
" <td>0.19</td>\n",
" <td>Intensivreinigung</td>\n",
" <td>0.60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2124</td>\n",
" <td>0.17</td>\n",
" <td>Fensterreinigung</td>\n",
" <td>0.77</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>503</td>\n",
" <td>0.04</td>\n",
" <td>Poolreinigung</td>\n",
" <td>0.81</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>474</td>\n",
" <td>0.04</td>\n",
" <td>Wintergartenreinigung</td>\n",
" <td>0.85</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>456</td>\n",
" <td>0.04</td>\n",
" <td>Baureinigung</td>\n",
" <td>0.88</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>418</td>\n",
" <td>0.03</td>\n",
" <td>Polsterreinigung</td>\n",
" <td>0.91</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>158</td>\n",
" <td>0.01</td>\n",
" <td>Messie-Reinigung</td>\n",
" <td>0.93</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>133</td>\n",
" <td>0.01</td>\n",
" <td>Frühjahrsputz</td>\n",
" <td>0.94</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>116</td>\n",
" <td>0.01</td>\n",
" <td>Treppenhaus-Reinigung</td>\n",
" <td>0.95</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>682</td>\n",
" <td>0.05</td>\n",
" <td>Sonstige</td>\n",
" <td>1.00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" freq relFreq name cumRelFreq\n",
"0 5169 0.41 Umzugsreinigung 0.41\n",
"1 2448 0.19 Intensivreinigung 0.60\n",
"2 2124 0.17 Fensterreinigung 0.77\n",
"3 503 0.04 Poolreinigung 0.81\n",
"4 474 0.04 Wintergartenreinigung 0.85\n",
"5 456 0.04 Baureinigung 0.88\n",
"6 418 0.03 Polsterreinigung 0.91\n",
"7 158 0.01 Messie-Reinigung 0.93\n",
"8 133 0.01 Frühjahrsputz 0.94\n",
"9 116 0.01 Treppenhaus-Reinigung 0.95\n",
"10 682 0.05 Sonstige 1.00"
]
},
"execution_count": 170,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Frequency table of cleaning types (`product_name`): absolute, relative and\n",
"# cumulative relative frequency, with everything beyond the top entries\n",
"# collapsed into a single \"Sonstige\" (others) row.\n",
"# NOTE(review): the original header comment said the data holds only\n",
"# special-cleaning requests; confirm against the export.\n",
"\n",
"TOP_N = 10  # product rows listed individually before grouping the rest\n",
"\n",
"# One pass over the column instead of one boolean scan per product.\n",
"# value_counts() ignores rows without a product name, so `total` is the\n",
"# number of rows that actually have one and the shares sum to 1.\n",
"counts = data[\"product_name\"].value_counts()\n",
"total = counts.sum()\n",
"\n",
"products = counts.rename_axis(\"name\").reset_index(name=\"freq\")\n",
"products[\"relFreq\"] = products[\"freq\"] / total\n",
"products = (\n",
"    products[[\"freq\", \"relFreq\", \"name\"]]\n",
"    # Most frequent first; ties are ordered by name, descending.\n",
"    .sort_values([\"freq\", \"name\"], ascending=False)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# Accumulate the unrounded shares first, then round both columns for display,\n",
"# so rounding errors do not pile up in the cumulative column.\n",
"products[\"cumRelFreq\"] = products[\"relFreq\"].cumsum()\n",
"products[[\"cumRelFreq\", \"relFreq\"]] = products[[\"cumRelFreq\", \"relFreq\"]].round(2)\n",
"\n",
"# Everything after the first TOP_N rows is summarised in one extra row.\n",
"top = products.head(TOP_N)\n",
"othersFreq = products[\"freq\"].sum() - top[\"freq\"].sum()\n",
"othersRelFreq = round(othersFreq / total, 2)\n",
"othersCumRelFreq = 1.0  # the final row closes the distribution by definition\n",
"others = pd.DataFrame(\n",
"    [\n",
"        {\n",
"            \"freq\": othersFreq,\n",
"            \"relFreq\": othersRelFreq,\n",
"            \"name\": \"Sonstige\",\n",
"            \"cumRelFreq\": othersCumRelFreq,\n",
"        }\n",
"    ]\n",
")\n",
"\n",
"# Build a new frame rather than assigning into the head() slice; writing into\n",
"# that slice is what raised SettingWithCopyWarning.\n",
"shortened = pd.concat([top, others], ignore_index=True)\n",
"shortened"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4286aa50-1ef5-45a0-a90b-c65534e1ec3a",
"metadata": {},
"outputs": [],
"source": [
"# Draw a reproducible random sample of window-cleaning quotations.\n",
"# NOTE(review): this cell previously contained an empty `booking =` assignment\n",
"# (a syntax error) and sampled from an undefined `filtered`. The selection\n",
"# below is reconstructed from the export file name further down; confirm that\n",
"# it is the intended filter.\n",
"\n",
"SAMPLE_SIZE = 1000\n",
"SAMPLE_SEED = 43\n",
"\n",
"booking = data\n",
"filtered = booking.loc[booking[\"product_name\"] == \"Fensterreinigung\"]\n",
"\n",
"# sample() raises when asked for more rows than exist, so cap the request.\n",
"df = filtered.sample(\n",
"    n=min(SAMPLE_SIZE, len(filtered)),\n",
"    random_state=SAMPLE_SEED,\n",
").reset_index(drop=True)\n",
"\n",
"# Export is disabled; uncomment to write the sample to disk.\n",
"#df.to_csv(\"./windowQuotationsSample.csv\", index=False,sep=\";\")\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}