# Standard library
import json
import re
import unicodedata

# Third-party
import numpy as np
import pandas as pd

# Load the semicolon-separated quotations export (malformed lines are skipped
# instead of aborting the load) and keep only the rows that have both a
# textual requirements description and a price.
#
# NOTE(review): no product filter is applied here. A stricter filter
# (product_name == "Fensterreinigung" and a missing quotation_info_request)
# was left disabled in the original cell.
data = pd.read_csv(
    "./quotations.csv",
    sep=";",
    on_bad_lines="skip",
).dropna(subset=["requirements_textual", "price"])
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
freqrelFreqnamecumRelFreq
051690.41Umzugsreinigung0.41
124480.19Intensivreinigung0.60
221240.17Fensterreinigung0.77
35030.04Poolreinigung0.81
44740.04Wintergartenreinigung0.85
54560.04Baureinigung0.88
64180.03Polsterreinigung0.91
71580.01Messie-Reinigung0.93
81330.01Frühjahrsputz0.94
91160.01Treppenhaus-Reinigung0.95
106820.05Sonstige1.00
\n", + "
# Special-cleaning requests only.
# Absolute and relative frequency of the cleaning types (product names).


def summarize_products(product_names, top_n=10, others_label="Sonstige"):
    """Build a frequency table of product names plus a shortened top-N view.

    Parameters
    ----------
    product_names : pandas.Series
        One product name per row.
    top_n : int, default 10
        Number of most frequent products kept individually in the short table.
    others_label : str, default "Sonstige"
        Name given to the row that aggregates every product outside the top N.

    Returns
    -------
    (products, shortened) : tuple of pandas.DataFrame
        ``products`` has one row per distinct product with the columns
        ``freq``, ``relFreq``, ``name`` and ``cumRelFreq``, sorted by
        descending frequency (ties broken by descending name).
        ``shortened`` holds the first ``top_n`` rows of ``products`` followed
        by a single aggregate row labelled ``others_label``.
    """
    row_count = len(product_names)

    # One pass over the column instead of re-scanning it for every product.
    # value_counts() ignores missing names, so no zero-count NaN row appears.
    counts = product_names.value_counts()

    products = pd.DataFrame(
        {
            "freq": counts.to_numpy(),
            # Relative to all rows (including rows without a product name).
            "relFreq": counts.to_numpy() / max(row_count, 1),
            "name": counts.index.to_numpy(),
        }
    )
    products = products.sort_values(
        ["freq", "name"], ascending=False
    ).reset_index(drop=True)

    # Accumulate the unrounded shares first, then round both columns, so
    # rounding errors do not add up in the cumulative column.
    products["cumRelFreq"] = products["relFreq"].cumsum()
    products[["cumRelFreq", "relFreq"]] = products[["cumRelFreq", "relFreq"]].round(2)

    top = products.head(top_n)
    total_freq = products["freq"].sum()
    others_freq = total_freq - top["freq"].sum()
    others_rel_freq = round(others_freq / total_freq, 2) if total_freq else 0.0

    others_row = pd.DataFrame(
        [
            {
                "freq": others_freq,
                "relFreq": others_rel_freq,
                "name": others_label,
                "cumRelFreq": 1.0,  # the aggregate row closes the distribution
            }
        ]
    )
    # Concatenate into a new frame rather than writing into the head() slice,
    # which triggered a SettingWithCopyWarning.
    shortened = pd.concat([top, others_row], ignore_index=True)
    return products, shortened


products, shortened = summarize_products(data["product_name"])
shortened
# Draw a reproducible random sample of window-cleaning quotations.
#
# The original cell could not run: `booking = ` was an incomplete assignment
# (SyntaxError) and `filtered` was never defined (NameError).
#
# NOTE(review): the product filter below is inferred from the disabled filter
# in the loading cell and from the export filename further down; confirm it
# is the intended selection. That disabled filter also required a missing
# `quotation_info_request`; it is not applied here because the column is
# never used in the notebook. TODO confirm.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 43

booking = data
filtered = booking.loc[booking["product_name"] == "Fensterreinigung"]

# Cap the sample size so the cell does not raise when fewer rows are available.
df = filtered.sample(
    n=min(SAMPLE_SIZE, len(filtered)),
    random_state=SAMPLE_SEED,
).reset_index(drop=True)

# Optional export, left disabled as in the original:
# df.to_csv("./windowQuotationsSample.csv", index=False, sep=";")