81 lines
2.5 KiB
Python
81 lines
2.5 KiB
Python
import pandas as pd
|
|
import json
|
|
import os
|
|
# generate short sample
#
# Builds one reproducible sample CSV per cleaning product from the quotations
# export, and annotates every sampled row with the image files that belong to
# it.

# Semicolon-separated quotations export that is sampled from.
QUOTATIONS_CSV = "./quotations.csv"

# Directory whose file names are matched against quotation ids.
IMAGE_DIR = "/var/huggingface/data"

# Upper bound on rows per sample, and the seed that keeps the draw repeatable.
SAMPLE_SIZE = 1000
RANDOM_STATE = 43

# Number of leading file-name characters used as the lookup key.
# NOTE(review): assumes every image file name starts with the 6-character
# quotation id -- confirm against the naming scheme of the image export.
ID_PREFIX_LENGTH = 6

# product_name value -> CSV file the sample for that product is written to.
# Insertion order is the processing order.
SAMPLE_OUTPUTS = {
    "Fensterreinigung": "./windowQuotationsSample.csv",
    "Umzugsreinigung": "./umzugQuotationsSample.csv",
    "Intensivreinigung": "./intensivQuotationsSample.csv",
}


def group_images_by_id(filenames, prefix_length=ID_PREFIX_LENGTH):
    """Group file names by their leading ``prefix_length`` characters.

    Args:
        filenames: Iterable of file-name strings.
        prefix_length: How many leading characters form the grouping key.

    Returns:
        Dict mapping each prefix to the list of file names that start with
        it, in the order they were encountered.
    """
    images_by_id = {}
    for filename in filenames:
        images_by_id.setdefault(filename[:prefix_length], []).append(filename)
    return images_by_id


def build_sample(booking, product_name, images_by_id,
                 n=SAMPLE_SIZE, random_state=RANDOM_STATE):
    """Draw a random sample of usable quotations for one product.

    A row is kept when its ``product_name`` equals ``product_name``, its
    ``requirements_textual`` and ``price`` are present, and its
    ``quotation_info_request`` is missing.

    Args:
        booking: DataFrame with at least the columns ``id``,
            ``product_name``, ``requirements_textual``, ``price`` and
            ``quotation_info_request``.
        product_name: Exact ``product_name`` value to select.
        images_by_id: Dict mapping ``str(id)`` to a list of image file names.
        n: Maximum number of rows to draw.
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        A new DataFrame with a fresh 0..k-1 index holding at most ``n`` rows
        plus two added columns: ``images`` (JSON-encoded list of file names,
        ``"[]"`` when none match) and ``n_Images`` (length of that list).
    """
    keep = (
        (booking["product_name"] == product_name)
        & booking["requirements_textual"].notna()
        & booking["price"].notna()
        & booking["quotation_info_request"].isna()
    )
    filtered = booking.loc[keep]

    # DataFrame.sample raises ValueError when asked for more rows than exist
    # (sampling without replacement), so clamp to the population size. With
    # enough rows the draw is unchanged.
    sample = filtered.sample(
        n=min(n, len(filtered)), random_state=random_state
    ).reset_index(drop=True)

    # Look the lists up once and derive both columns from them instead of
    # re-parsing the JSON that was just produced.
    image_lists = [
        images_by_id.get(key, []) for key in sample["id"].astype(str)
    ]
    sample["images"] = [json.dumps(names) for names in image_lists]
    sample["n_Images"] = [len(names) for names in image_lists]
    return sample


def main():
    """Read the quotations export and write one sample CSV per product."""
    booking = pd.read_csv(QUOTATIONS_CSV, on_bad_lines="skip", sep=";")

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # per-id image lists identical from run to run.
    images_by_id = group_images_by_id(sorted(os.listdir(IMAGE_DIR)))

    for product_name, output_path in SAMPLE_OUTPUTS.items():
        sample = build_sample(booking, product_name, images_by_id)
        print(sample["n_Images"])
        sample.to_csv(output_path, index=False, sep=";")


if __name__ == "__main__":
    main()