"""Generate short per-product samples of the quotation data.

Reads the quotation export, keeps the usable rows of each configured
product, draws a reproducible random sample, attaches the names of the
image files found for each quotation, and writes one CSV per product.
"""

import json
import os

import pandas as pd

QUOTATIONS_CSV = "./quotations.csv"
IMAGE_DIR = "/var/huggingface/data"

# Number of rows drawn per product and the seed that makes the draw repeatable.
SAMPLE_SIZE = 1000
RANDOM_STATE = 43

# Image file names are grouped by their first ID_PREFIX_LENGTH characters.
# NOTE(review): presumably that prefix is the quotation id -- confirm against
# the naming scheme of the files in IMAGE_DIR.
ID_PREFIX_LENGTH = 6

# product_name value -> path of the sample CSV written for that product.
SAMPLE_OUTPUTS = {
    "Fensterreinigung": "./windowQuotationsSample.csv",
    "Umzugsreinigung": "./umzugQuotationsSample.csv",
    "Intensivreinigung": "./intensivQuotationsSample.csv",
}


def group_images_by_id(image_names) -> dict:
    """Group image file names by their leading ``ID_PREFIX_LENGTH`` characters.

    Args:
        image_names: Iterable of file names (as returned by ``os.listdir``).

    Returns:
        Dict mapping each prefix to the list of file names that start with
        it, in the order the names were given.
    """
    grouped = {}
    for name in image_names:
        grouped.setdefault(name[:ID_PREFIX_LENGTH], []).append(name)
    return grouped


def build_sample(
    booking: pd.DataFrame,
    product_name: str,
    images_by_id: dict,
    n: int = SAMPLE_SIZE,
    random_state: int = RANDOM_STATE,
) -> pd.DataFrame:
    """Return a random sample of usable quotations for one product.

    A row is kept when its ``product_name`` equals *product_name*, its
    ``requirements_textual`` and ``price`` are present, and its
    ``quotation_info_request`` is missing.

    Args:
        booking: The full quotation table.
        product_name: Value of the ``product_name`` column to select.
        images_by_id: Mapping from id string to list of image file names.
        n: Maximum number of rows to draw.
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        A new frame with a fresh 0..k-1 index and two extra columns:
        ``images`` (JSON-encoded list of file names, ``"[]"`` when none)
        and ``n_Images`` (length of that list). It has ``min(n, matching
        rows)`` rows, so a product with few rows no longer raises.
    """
    usable = (
        (booking["product_name"] == product_name)
        & booking["requirements_textual"].notna()
        & booking["price"].notna()
        & booking["quotation_info_request"].isna()
    )
    matching = booking.loc[usable]

    # DataFrame.sample raises ValueError when n exceeds the population size
    # (replace=False), so cap the request at what is actually available.
    take = min(n, len(matching))
    if take:
        drawn = matching.sample(n=take, random_state=random_state)
    else:
        drawn = matching.iloc[:0]
    sample = drawn.reset_index(drop=True)

    # Look the images up once and derive both columns from the same lists
    # instead of parsing the JSON string back just to count its entries.
    image_lists = [images_by_id.get(key, []) for key in sample["id"].astype(str)]
    sample["images"] = [json.dumps(names) for names in image_lists]
    sample["n_Images"] = [len(names) for names in image_lists]
    return sample


def main() -> None:
    """Write one sample CSV per entry in ``SAMPLE_OUTPUTS``."""
    booking = pd.read_csv(QUOTATIONS_CSV, on_bad_lines="skip", sep=";")
    images_by_id = group_images_by_id(os.listdir(IMAGE_DIR))

    for product_name, out_path in SAMPLE_OUTPUTS.items():
        sample = build_sample(booking, product_name, images_by_id)
        if len(sample) < SAMPLE_SIZE:
            print(
                f"{product_name}: only {len(sample)} usable rows "
                f"(requested {SAMPLE_SIZE})"
            )
        print(sample["n_Images"])
        sample.to_csv(out_path, index=False, sep=";")


if __name__ == "__main__":
    main()