From f3931305985afe7d4bcb7c2f3a3a4da509ee7f7a Mon Sep 17 00:00:00 2001
From: Anton Wirsing
Date: Sun, 12 Nov 2023 19:40:27 +0100
Subject: [PATCH] init

---
 imageEncoder.py | 19 +++++++++++++++++++
 run.py          | 11 +++++++++++
 2 files changed, 30 insertions(+)
 create mode 100644 imageEncoder.py
 create mode 100644 run.py

diff --git a/imageEncoder.py b/imageEncoder.py
new file mode 100644
index 0000000..213cdd6
--- /dev/null
+++ b/imageEncoder.py
@@ -0,0 +1,19 @@
+import requests
+from PIL import Image
+
+from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel
+
+# load a fine-tuned image captioning model and its corresponding tokenizer and image processor
+model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
+
+# perform inference on an example image
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+pixel_values = image_processor(image, return_tensors="pt").pixel_values
+
+# autoregressively generate the caption (uses greedy decoding by default)
+generated_ids = model.generate(pixel_values)
+generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+print(generated_text)
diff --git a/run.py b/run.py
new file mode 100644
index 0000000..85e3339
--- /dev/null
+++ b/run.py
@@ -0,0 +1,11 @@
+from transformers import pipeline
+
+# load LLaVA v1.5 (AWQ-quantized) as an image-to-text pipeline on the second GPU
+#pipe = pipeline("text-generation", model="TheBloke/llava-v1.5-13B-AWQ", device_map="cuda:1")
+pipe = pipeline("image-to-text", model="TheBloke/llava-v1.5-13B-AWQ", device_map="cuda:1")
+
+# URL of the image to caption
+images = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+
+# generate and print the caption for the image
+print(pipe(images))