{ "cells": [ { "cell_type": "code", "execution_count": 156, "id": "83c66bad-e9bb-4ea7-9f6b-be842df821f0", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import json\n", "import re\n", "import unicodedata\n", "import numpy as np\n", "data = pd.read_csv(\"./quotations.csv\", on_bad_lines='skip',sep=\";\")\n", "\n", "# Filter to just “Fensterreinigung” and drop rows missing requirements_textual\n", "data = data.loc[\n", " data['requirements_textual'].notna()\n", " & data['price'].notna()\n", "]\n", "\n", "\n", "# (booking['product_name'] == \"Fensterreinigung\")\n", "# & booking['quotation_info_request'].isna()\n" ] }, { "cell_type": "code", "execution_count": 170, "id": "d78b8f4a-f140-463f-8369-163f520dca4b", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/tmp/ipykernel_3060060/2985588999.py:27: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " shortened.loc[len(shortened)] = [othersFreq,othersRelFreq,\"Sonstige\",othersCumRelFreq]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
freqrelFreqnamecumRelFreq
051690.41Umzugsreinigung0.41
124480.19Intensivreinigung0.60
221240.17Fensterreinigung0.77
35030.04Poolreinigung0.81
44740.04Wintergartenreinigung0.85
54560.04Baureinigung0.88
64180.03Polsterreinigung0.91
71580.01Messie-Reinigung0.93
81330.01Frühjahrsputz0.94
91160.01Treppenhaus-Reinigung0.95
106820.05Sonstige1.00
\n", "
" ], "text/plain": [ " freq relFreq name cumRelFreq\n", "0 5169 0.41 Umzugsreinigung 0.41\n", "1 2448 0.19 Intensivreinigung 0.60\n", "2 2124 0.17 Fensterreinigung 0.77\n", "3 503 0.04 Poolreinigung 0.81\n", "4 474 0.04 Wintergartenreinigung 0.85\n", "5 456 0.04 Baureinigung 0.88\n", "6 418 0.03 Polsterreinigung 0.91\n", "7 158 0.01 Messie-Reinigung 0.93\n", "8 133 0.01 Frühjahrsputz 0.94\n", "9 116 0.01 Treppenhaus-Reinigung 0.95\n", "10 682 0.05 Sonstige 1.00" ] }, "execution_count": 170, "metadata": {}, "output_type": "execute_result" } ], "source": [ "#Nur Spezialreinigungsanfragen\n", "\n", "data.keys()\n", "\n", "#Absolute und relative Häufigkeit von Reinigungsarten\n", "countProduct = (lambda x: [ sum((data[\"product_name\"]==x)) \\\n", " ,sum((data[\"product_name\"]==x)) / len(data[\"product_name\"]) \\\n", " ,x ])\n", "\n", "products = sorted(list(map(countProduct,data[\"product_name\"].unique())),reverse=True)\n", "products = pd.DataFrame(products, columns=[\"freq\",\"relFreq\",\"name\"])\n", "#products = products.sort_values(by=\"freq\", ascending=False)\n", "products\n", "\n", "for i in products.index:\n", " if i>0:\n", " products.at[i,\"cumRelFreq\"]=products.at[i-1,\"cumRelFreq\"] + products.at[i,\"relFreq\"]\n", " else:\n", " products.at[i,\"cumRelFreq\"]=products.at[i,\"relFreq\"]\n", "\n", "products[[\"cumRelFreq\",\"relFreq\"]] = round(products[[\"cumRelFreq\",\"relFreq\"]], 2)\n", "\n", "shortened = products.head(10)\n", "othersFreq = sum(products[\"freq\"]) - sum(shortened[\"freq\"]) \n", "othersRelFreq = round( othersFreq / sum(products[\"freq\"]) , 2)\n", "othersCumRelFreq = 1\n", "shortened.loc[len(shortened)] = [othersFreq,othersRelFreq,\"Sonstige\",othersCumRelFreq]\n", "shortened\n", "\n", "#shortened[\"freq\"]\n", "#pd.concat([[0,0,0,0],shortened])\n", "#help(pd.concat)\n", "#help(pd.DataFrame.sort_values)" ] }, { "cell_type": "code", "execution_count": null, "id": "4286aa50-1ef5-45a0-a90b-c65534e1ec3a", "metadata": {}, "outputs": [], "source": [ "#data\n", "\n", "booking = \n", "#images = os.listdir(\"/Users/antonwirsing/Nextcloud/share_anton/ExtraSauber-quotations-training-data-2025-05-09\")\n", "\n", "\n", "\n", "\n", "\n", "df = filtered.sample(n=1000, random_state=43).reset_index(drop=True)\n", "#print(df.columns)\n", "\n", "\n", "#df.to_csv(\"./windowQuotationsSample.csv\", index=False,sep=\";\")\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }