create frequency table

2025-08-21 17:01:44 +02:00 · 2025-08-21 17:01:44 +02:00 · c0a083ae92
commit c0a083ae92
parent 1455466044
1 changed files with 255 additions and 0 deletions
--- a/dataMining.ipynb
+++ b/dataMining.ipynb
@ -0,0 +1,255 @@
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 156,
   "id": "83c66bad-e9bb-4ea7-9f6b-be842df821f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import re\n",
    "import unicodedata\n",
    "import numpy as np\n",
    "# Load the raw quotation export (semicolon-separated CSV).\n",
    "# NOTE: on_bad_lines='skip' silently drops rows that cannot be parsed.\n",
    "data = pd.read_csv(\"./quotations.csv\", on_bad_lines='skip', sep=\";\")\n",
    "\n",
    "# Keep only quotations that have both a textual requirements description and a price.\n",
    "# No product filter is applied here, so `data` still contains every product_name.\n",
    "data = data.dropna(subset=[\"requirements_textual\", \"price\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "id": "d78b8f4a-f140-463f-8369-163f520dca4b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/tmp/ipykernel_3060060/2985588999.py:27: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  shortened.loc[len(shortened)] = [othersFreq,othersRelFreq,\"Sonstige\",othersCumRelFreq]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>freq</th>\n",
       "      <th>relFreq</th>\n",
       "      <th>name</th>\n",
       "      <th>cumRelFreq</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>5169</td>\n",
       "      <td>0.41</td>\n",
       "      <td>Umzugsreinigung</td>\n",
       "      <td>0.41</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2448</td>\n",
       "      <td>0.19</td>\n",
       "      <td>Intensivreinigung</td>\n",
       "      <td>0.60</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2124</td>\n",
       "      <td>0.17</td>\n",
       "      <td>Fensterreinigung</td>\n",
       "      <td>0.77</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>503</td>\n",
       "      <td>0.04</td>\n",
       "      <td>Poolreinigung</td>\n",
       "      <td>0.81</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>474</td>\n",
       "      <td>0.04</td>\n",
       "      <td>Wintergartenreinigung</td>\n",
       "      <td>0.85</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>456</td>\n",
       "      <td>0.04</td>\n",
       "      <td>Baureinigung</td>\n",
       "      <td>0.88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>418</td>\n",
       "      <td>0.03</td>\n",
       "      <td>Polsterreinigung</td>\n",
       "      <td>0.91</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>158</td>\n",
       "      <td>0.01</td>\n",
       "      <td>Messie-Reinigung</td>\n",
       "      <td>0.93</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>133</td>\n",
       "      <td>0.01</td>\n",
       "      <td>Frühjahrsputz</td>\n",
       "      <td>0.94</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>116</td>\n",
       "      <td>0.01</td>\n",
       "      <td>Treppenhaus-Reinigung</td>\n",
       "      <td>0.95</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>682</td>\n",
       "      <td>0.05</td>\n",
       "      <td>Sonstige</td>\n",
       "      <td>1.00</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    freq  relFreq                   name  cumRelFreq\n",
       "0   5169     0.41        Umzugsreinigung        0.41\n",
       "1   2448     0.19      Intensivreinigung        0.60\n",
       "2   2124     0.17       Fensterreinigung        0.77\n",
       "3    503     0.04          Poolreinigung        0.81\n",
       "4    474     0.04  Wintergartenreinigung        0.85\n",
       "5    456     0.04           Baureinigung        0.88\n",
       "6    418     0.03       Polsterreinigung        0.91\n",
       "7    158     0.01       Messie-Reinigung        0.93\n",
       "8    133     0.01          Frühjahrsputz        0.94\n",
       "9    116     0.01  Treppenhaus-Reinigung        0.95\n",
       "10   682     0.05               Sonstige        1.00"
      ]
     },
     "execution_count": 170,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Absolute, relative and cumulative frequency of cleaning types (product_name).\n",
    "# Scope per the original author's note: special-cleaning requests only; no filter in this cell enforces that.\n",
    "\n",
    "TOP_N = 10  # products listed individually; everything else is pooled into \"Sonstige\"\n",
    "\n",
    "# Count every product in one pass instead of rescanning the column once per product.\n",
    "# Rows without a product_name are not counted, so all shares are relative to the counted rows and sum to 1.\n",
    "counts = data[\"product_name\"].value_counts()\n",
    "total = int(counts.sum())\n",
    "\n",
    "# Rank by frequency; ties are broken by name (both descending) so the order is deterministic.\n",
    "ranked = (\n",
    "    counts.rename_axis(\"name\")\n",
    "    .reset_index(name=\"freq\")\n",
    "    .sort_values([\"freq\", \"name\"], ascending=False, ignore_index=True)\n",
    ")\n",
    "relFreq = ranked[\"freq\"] / total\n",
    "\n",
    "# Accumulate the unrounded shares first and round afterwards, so rounding errors do not add up.\n",
    "products = pd.DataFrame(\n",
    "    {\n",
    "        \"freq\": ranked[\"freq\"],\n",
    "        \"relFreq\": relFreq.round(2),\n",
    "        \"name\": ranked[\"name\"],\n",
    "        \"cumRelFreq\": relFreq.cumsum().round(2),\n",
    "    }\n",
    ")\n",
    "\n",
    "# Pool everything below the top TOP_N products into a single \"Sonstige\" (others) row.\n",
    "othersFreq = int(products[\"freq\"].iloc[TOP_N:].sum())\n",
    "othersRelFreq = round(othersFreq / total, 2) if total else 0.0  # guard against empty data\n",
    "othersCumRelFreq = 1.0  # the pooled row closes the table, so the cumulative share is 1 by definition\n",
    "others = pd.DataFrame(\n",
    "    [{\"freq\": othersFreq, \"relFreq\": othersRelFreq, \"name\": \"Sonstige\", \"cumRelFreq\": othersCumRelFreq}]\n",
    ")\n",
    "\n",
    "# Build a new frame with concat; assigning a row into products.head(TOP_N) raises SettingWithCopyWarning.\n",
    "shortened = pd.concat([products.head(TOP_N), others], ignore_index=True)\n",
    "shortened"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4286aa50-1ef5-45a0-a90b-c65534e1ec3a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw a reproducible random sample of quotations for a single product.\n",
    "# NOTE(review): the product filter is inferred from the export file name (windowQuotationsSample.csv) - confirm.\n",
    "# An additional condition such as data[\"quotation_info_request\"].isna() may be intended; verify that the column exists.\n",
    "SAMPLE_SIZE = 1000\n",
    "\n",
    "filtered = data.loc[data[\"product_name\"] == \"Fensterreinigung\"]\n",
    "\n",
    "# Cap the sample size at the number of available rows so sample() cannot raise when fewer rows remain.\n",
    "df = filtered.sample(n=min(SAMPLE_SIZE, len(filtered)), random_state=43).reset_index(drop=True)\n",
    "\n",
    "# The export is disabled; uncomment to write the sample to disk.\n",
    "#df.to_csv(\"./windowQuotationsSample.csv\", index=False,sep=\";\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }