# huggingface/data.py
# 2025-08-18 13:20:56 +02:00
# 81 lines, 2.5 KiB, Python

import pandas as pd
import json
import os
# generate short sample
# Load the raw quotation export. Some rows in the dump are malformed, so
# skip them rather than abort the whole read.
booking = pd.read_csv("./quotations.csv", on_bad_lines='skip', sep=";")
#images = os.listdir("/Users/antonwirsing/Nextcloud/share_anton/ExtraSauber-quotations-training-data-2025-05-09")
images = os.listdir("/var/huggingface/data")
# Group image filenames by quotation id.
# NOTE(review): assumes every filename starts with a 6-character quotation
# id — confirm against the naming convention of the image export.
images_by_id = {}
for img in images:
    quotation_id = img[:6]  # renamed: `id` shadowed the builtin
    images_by_id.setdefault(quotation_id, []).append(img)
def _export_sample(product_name, out_path, quotations, images_lookup,
                   n=1000, random_state=43):
    """Write a per-product training sample of quotations to a CSV file.

    Filters `quotations` to rows for `product_name` that have a
    `requirements_textual` and a `price` and no open
    `quotation_info_request`, samples up to `n` of them reproducibly,
    attaches the image filenames for each quotation id, and writes the
    result to `out_path` (semicolon-separated, no index column).

    Parameters:
        product_name: value matched against the `product_name` column.
        out_path: destination CSV path.
        quotations: DataFrame with at least the columns used below.
        images_lookup: dict mapping quotation id (str) -> list of filenames.
        n: sample size cap (capped at the number of matching rows).
        random_state: seed for reproducible sampling.

    Returns the sampled DataFrame.
    """
    filtered = quotations.loc[
        (quotations['product_name'] == product_name)
        & quotations['requirements_textual'].notna()
        & quotations['price'].notna()
        & quotations['quotation_info_request'].isna()
    ]
    # Cap at the available row count: sample(n=...) raises ValueError when
    # asked for more rows than exist.
    sample = (
        filtered
        .sample(n=min(n, len(filtered)), random_state=random_state)
        .reset_index(drop=True)
    )
    # JSON-dump each id's image list (empty list when the id has no images)
    # so the column survives a round-trip through CSV.
    sample['images'] = (
        sample['id']
        .astype(str)
        .apply(lambda i: json.dumps(images_lookup.get(i, [])))
    )
    sample['n_Images'] = sample['images'].apply(lambda s: len(json.loads(s)))
    print(sample['n_Images'])
    sample.to_csv(out_path, index=False, sep=";")
    return sample


# One sample file per product category; the three pipelines were previously
# copy-pasted and differed only in product name and output path.
_export_sample("Fensterreinigung", "./windowQuotationsSample.csv", booking, images_by_id)
_export_sample("Umzugsreinigung", "./umzugQuotationsSample.csv", booking, images_by_id)
_export_sample("Intensivreinigung", "./intensivQuotationsSample.csv", booking, images_by_id)