# huggingface/data.py
# 2025-08-18 13:20:56 +02:00
# 81 lines, 2.5 KiB, Python

import pandas as pd
import json
import os
# generate short sample
# Load the raw quotation export. Some rows in the dump are malformed, so
# skip them rather than abort the whole read.
booking = pd.read_csv("./quotations.csv", on_bad_lines='skip', sep=";")
#images = os.listdir("/Users/antonwirsing/Nextcloud/share_anton/ExtraSauber-quotations-training-data-2025-05-09")
images = os.listdir("/var/huggingface/data")
# Group image filenames by quotation id.
# NOTE(review): assumes every filename starts with a 6-character quotation
# id — confirm against the naming convention of the image export.
images_by_id = {}
for img in images:
    quotation_id = img[:6]  # renamed: `id` shadowed the builtin
    images_by_id.setdefault(quotation_id, []).append(img)
def _export_sample(product_name, out_path, quotations, images_lookup,
                   n=1000, random_state=43):
    """Write a per-product training sample of quotations to a CSV file.

    Filters `quotations` to rows for `product_name` that have a
    `requirements_textual` and a `price` and no open
    `quotation_info_request`, samples up to `n` of them reproducibly,
    attaches the image filenames for each quotation id, and writes the
    result to `out_path` (semicolon-separated, no index column).

    Parameters:
        product_name: value matched against the `product_name` column.
        out_path: destination CSV path.
        quotations: DataFrame with at least the columns used below.
        images_lookup: dict mapping quotation id (str) -> list of filenames.
        n: sample size cap (capped at the number of matching rows).
        random_state: seed for reproducible sampling.

    Returns the sampled DataFrame.
    """
    filtered = quotations.loc[
        (quotations['product_name'] == product_name)
        & quotations['requirements_textual'].notna()
        & quotations['price'].notna()
        & quotations['quotation_info_request'].isna()
    ]
    # Cap at the available row count: sample(n=...) raises ValueError when
    # asked for more rows than exist.
    sample = (
        filtered
        .sample(n=min(n, len(filtered)), random_state=random_state)
        .reset_index(drop=True)
    )
    # JSON-dump each id's image list (empty list when the id has no images)
    # so the column survives a round-trip through CSV.
    sample['images'] = (
        sample['id']
        .astype(str)
        .apply(lambda i: json.dumps(images_lookup.get(i, [])))
    )
    sample['n_Images'] = sample['images'].apply(lambda s: len(json.loads(s)))
    print(sample['n_Images'])
    sample.to_csv(out_path, index=False, sep=";")
    return sample


# One sample file per product category; the three pipelines were previously
# copy-pasted and differed only in product name and output path.
_export_sample("Fensterreinigung", "./windowQuotationsSample.csv", booking, images_by_id)
_export_sample("Umzugsreinigung", "./umzugQuotationsSample.csv", booking, images_by_id)
_export_sample("Intensivreinigung", "./intensivQuotationsSample.csv", booking, images_by_id)