{ "cells": [ { "cell_type": "code", "execution_count": 2, "id": "ce5cf325-1b49-462b-b5cc-2564a3ae356b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " duration ZusatzInfo complete \\\n", "0 25 Bitte geben Sie an, ob zusätzlich Fensterreini... 0 \n", "\n", " confidence missverständliche Aspekte \\\n", "0 0.3 Nicht spezifiziert, ob die 50 qm die gesamte R... \n", "\n", " Zu erbringende Leistungen: \\\n", "0 Reinigung aller Räume (Boden, Oberflächen), Gr... \n", "\n", " Rechenweg \\\n", "0 Annahme 5 Minuten pro 10 qm → 50 qm / 10 = 5 E... \n", "\n", " Kommentare \n", "0 Beschreibung unvollständig; weitere Angaben zu... \n" ] } ], "source": [ "import pandas as pd\n", "import json\n", "import re\n", "import unicodedata\n", "import numpy as np\n", "data = pd.read_csv(\"./umzugQuotationsSampleWithResponse.csv\", on_bad_lines='skip',sep=\";\")\n", "#imagecount = pd.read_csv(\"./quotationsSample.csv\", on_bad_lines='skip',sep=\";\")['n_Images']\n", "#data['n_Images'] = imagecount\n", "data[\"inquired\"] = data[\"inquired\"].apply(np.datetime64)\n", "\n", "import re, json\n", "def normalize_col(name: str) -> str:\n", " s = unicodedata.normalize(\"NFKC\", str(name)) # unify unicode\n", " s = s.replace(\"\\xa0\", \" \") # NBSP -> space\n", " s = re.sub(r\"\\s+\", \" \", s).strip() # collapse spaces\n", " s = re.sub(r\":+\\s*$\", \"\", s) # drop trailing colons\n", "\n", " return s\n", "def extract_json_from_response(raw: str) -> dict | None:\n", " \"\"\"\n", " Extract the JSON object that appears in content='...'.\n", " Returns a dict or None if not found / invalid.\n", " \"\"\"\n", " if not isinstance(raw, str):\n", " return None\n", "\n", " # 1) Prefer: content=' {...} ' or content=\" {...} \"\n", " m = re.search(r\"content=(?P['\\\"])(?P\\{.*?\\})(?P=q)\", raw, flags=re.DOTALL)\n", " if m:\n", " json_str = m.group(\"body\")\n", " try:\n", " # --- minimal normalization: collapse backslash runs before a quote to \\\" ---\n", " json_str = 
re.sub(r'\\\\+\"', r'\\\"', json_str) # <-- CHANGED\n", " return json.loads(json_str) # <-- CHANGED (removed early return of raw string)\n", " except json.JSONDecodeError:\n", " pass # fall through to brace-balanced fallback\n", "\n", " # 2) Fallback: find the first '{' after 'content=' and parse a balanced JSON object\n", " m2 = re.search(r\"content=([\\'\\\"])?.*?(\\{)\", raw, flags=re.DOTALL)\n", " if not m2:\n", " return None\n", "\n", " start = m2.start(2) # index of first '{'\n", " # Walk to matching closing '}' while tracking nesting\n", " depth = 0\n", " i = start\n", " in_string = False\n", " esc = False\n", " while i < len(raw):\n", " ch = raw[i]\n", " if in_string:\n", " if esc:\n", " esc = False\n", " elif ch == '\\\\':\n", " esc = True\n", " elif ch == '\"':\n", " in_string = False\n", " else:\n", " if ch == '\"':\n", " in_string = True\n", " elif ch == '{':\n", " depth += 1\n", " elif ch == '}':\n", " depth -= 1\n", " if depth == 0:\n", " json_str = raw[start:i+1]\n", " try:\n", " # --- same minimal normalization here ---\n", " json_str = re.sub(r'\\\\+\"', r'\\\"', json_str) # <-- CHANGED\n", " return json.loads(json_str) # <-- CHANGED (removed print/early return)\n", " except json.JSONDecodeError:\n", " return None\n", " i += 1\n", " return None\n", "\n", "# --- Example: single row ---\n", "raw = data.loc[1, \"response\"]\n", "parsed = extract_json_from_response(raw)\n", "if parsed is None:\n", " raise ValueError(\"Could not extract valid JSON from response cell.\")\n", "df_one = pd.DataFrame([parsed])\n", "print(df_one)\n", "\n", "# --- Expand ALL rows into columns ---\n", "parsed_rows = [extract_json_from_response(x) or {} for x in data[\"response\"]]\n", "expanded = pd.DataFrame(parsed_rows)\n", "\n", "# Normalize column names\n", "expanded.columns = [normalize_col(c) for c in expanded.columns]\n", "\n", "# Coalesce duplicate columns (row-wise first non-null)\n", "def coalesce_dupe_cols(df: pd.DataFrame) -> pd.DataFrame:\n", " out = {}\n", " for 
col in dict.fromkeys(df.columns): # preserves original order\n", " same = [c for c in df.columns if c == col]\n", " if len(same) == 1:\n", " out[col] = df[same[0]]\n", " else:\n", " out[col] = df[same].bfill(axis=1).iloc[:, 0] # pick first non-null per row\n", " return pd.DataFrame(out, index=df.index)\n", "\n", "expanded = coalesce_dupe_cols(expanded)\n", "\n", "# (optional) If you’d rather just drop duplicates and keep the first:\n", "# expanded = expanded.loc[:, ~expanded.columns.duplicated()]\n", "\n", "# Prefix to avoid collisions with original data\n", "expanded = expanded.add_prefix(\"resp_\")\n", "\n", "data_expanded = pd.concat(\n", " [data.reset_index(drop=True), expanded.reset_index(drop=True)], axis=1\n", ")\n", "\n", "\n", "data_expanded = pd.DataFrame(data_expanded)\n", "data_expanded = data_expanded.loc[data_expanded[\"resp_duration\"].notna()]\n", "data_expanded = data_expanded.loc[data_expanded[\"resp_duration\"]!=\"0\"]\n", "\n", "data_expanded[\"resp_duration\"] = pd.to_numeric(data_expanded[\"resp_duration\"])\n", "data_expanded[\"diff_duration\"] = data_expanded[\"duration\"] - data_expanded[\"resp_duration\"]#-50.88\n" ] }, { "cell_type": "code", "execution_count": null, "id": "d098d74e-0bad-49ea-a770-74501358c40f", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "bb4a7f8c-784e-4c50-866a-da19625950cc", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "d5281345-9528-49ff-8e34-741ddeebbbba", "metadata": {}, "outputs": [], "source": [ "data_expanded[\"diff_duration\"]" ] }, { "cell_type": "code", "execution_count": 3, "id": "539a367a-dd9a-441f-9a6b-5563422b99d1", "metadata": {}, "outputs": [ { "data": { "image/png": "", "text/plain": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "from plot import plotVariables\n", "plotVariables(data_expanded[\"duration\"],data_expanded[\"resp_duration\"])" ] }, { "cell_type": "code", "execution_count": 6, "id": "584ab790-e5b7-4d52-af05-39f8d2f1692d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'role=\\'assistant\\' content=\\'{\"duration\":\"25\",\"ZusatzInfo\":\"Bitte geben Sie an, ob zusätzlich Fensterreinigung, Teppichreinigung, Bodenpflege, etc. erforderlich sind, sowie die genaue Aufteilung der 50\\\\u202fqm auf die einzelnen Räume.\",\"complete\":\"0\",\"confidence\":\"0.3\",\"missverständliche Aspekte\":\"Nicht spezifiziert, ob die 50\\\\u202fqm die gesamte Raumfläche oder die zu reinigende Bodenfläche umfasst; fehlende Angaben zu Fenstern, Bodenbelag, etc.\",\"Zu erbringende Leistungen:\":\"Reinigung aller Räume (Boden, Oberflächen), Grundreinigung.\",\"Rechenweg\":\"Annahme 5 Minuten pro 10\\\\u202fqm → 50\\\\u202fqm / 10 = 5 Einheiten × 5\\\\u202fMinuten = 25\\\\u202fMinuten.\",\"Kommentare\":\"Beschreibung unvollständig; weitere Angaben zu Fenstern, Bodenbelag und spezifischen Anforderungen nötig.\"}\\' thinking=\\'We need to respond only in JSON with specified fields: duration, ZusatzInfo, complete, confidence, missverständliche Aspekte, Zu erbringende Leistungen:, Rechenweg, Kommentare.\\\\n\\\\nThe user gave description: \"50qm Alle Räume bitte reinigen Küche und Wohnzimmer Abstellraum Schlafzimmer Badezimmer Wc Am 28.6.23\". It\\\\\\'s not specifying which specific tasks or details; only that total area is 50 qm and all rooms to be cleaned. We must interpret. But tasks are \"reinigungsaufträge zur Endreinigung\" for end cleaning after occupant moved out. The example durations given: windows, balcony, etc. But user didn\\\\\\'t mention windows, floor area? There\\\\\\'s mention of 50 qm total area. So we can assume floor cleaning area maybe 50 qm. But we don\\\\\\'t have tasks like windows, etc. 
So we need to interpret.\\\\n\\\\nWe must check if description is complete: It lists rooms but not details. We only have area 50qm, not each room area. So it\\\\\\'s incomplete. Should ask for more info. But per instruction: If no extra info needed, leave field empty. But we do need extra info: like details of windows, floor area, etc. So we should set ZusatzInfo empty? Wait \"Gibt es zusätzliche Informationen die du vom Kunden anfordern möchtest? Lass das feld leer, wenn nicht.\" So we can ask additional info. But the field \"ZusatzInfo\" is for extra info that we need? It says \"Gibt es zusätzliche Informationen die du vom Kunden anfordern möchtest? Lass das Feld leer, wenn nicht.\" So we should fill \"ZusatzInfo\" with the question we want to ask: like \"Bitte geben Sie an, ob zusätzliche Leistungen wie Fensterreinigung, Teppichreinigung etc. erforderlich sind.\" So we need to provide a question.\\\\n\\\\nWe must provide \"complete\" as 1 if description complete else 0. It\\\\\\'s incomplete, so 0.\\\\n\\\\nConfidence: how confident we are in interpretation? Maybe 0.3.\\\\n\\\\nMissverständliche Aspekte: likely lack of detail on area per room, whether windows included, floor area.\\\\n\\\\nZu erbringende Leistungen: we can list generic: Reinigung aller Räume (floor, surfaces), but we need to estimate duration based on area? They gave 50qm. Example durations: windows 20 min, balcony 20, altbau window 30, etc. For floor cleaning: Not specified. We might estimate 10 min per 10 sqm? Not given. But we can approximate? But no instructions for floor cleaning duration. The schema demands \"duration\": \"quoted integer\". So we must produce integer minutes. We could guess maybe 5 min per 10 sqm => 25 min. But uncertain. Since incomplete, we might leave duration as \"?\" But must provide integer. We can compute maybe 10 min per 10 sqm => 50sqm -> 50 min? Or 1 min per sqm? Not defined. Maybe safe: 5 min per 10sqm => 25 min. But mention uncertain. 
# Show the raw, unparsed response string stored under index label 1 — the same
# row the first cell parses as its single-row example (data.loc[1, "response"]).
# .loc looks the row up by label, so this raises KeyError if row 1 was removed
# by the duration filtering that produced data_expanded.
data_expanded.loc[1,"response"]
"outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "921d4d2b-a01a-4f39-89f4-00ed5193df63", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c934a31d-c55f-406e-a381-fd6cf9778dca", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "08eac323-951f-45f4-b2c7-925ea92c269e", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.11" } }, "nbformat": 4, "nbformat_minor": 5 }