from PIL import Image, ImageDraw, ImageFont
import torch
from transformers import Owlv2Processor, Owlv2ForObjectDetection
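
# Zero-shot object detection with OWLv2: the model localizes regions that match
# a free-text query (e.g. "a photo of a cat") without task-specific training.
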
def get_model():
    # Load the OWLv2 processor and detection model; weights are downloaded
    # from the Hugging Face Hub on first use.
    processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
    model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
    return processor, model

def get_image(image_path):
    # Convert to RGB so grayscale or RGBA inputs don't trip up the processor.
    return Image.open(image_path).convert("RGB")

def get_texts(processor, image, look_for: str):
    # The processor expects one list of text queries per image, so wrap the
    # single query in a nested list; this also keeps the downstream
    # texts[i][label] lookup valid (a flat list would index into the string).
    texts = [[look_for]]
    inputs = processor(text=texts, images=image, return_tensors="pt")
    return inputs, texts

def process(model, processor, inputs, image):
    with torch.no_grad():
        outputs = model(**inputs)
    # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
    target_sizes = torch.Tensor([image.size[::-1]])
    # Convert outputs (bounding boxes and class logits) to Pascal VOC format
    # (xmin, ymin, xmax, ymax); returns one dict per image with "boxes",
    # "scores", and "labels" keys.
    results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
    return results

def draw_boxes(image, boxes, scores, labels, texts):
    draw = ImageDraw.Draw(image)
    font = ImageFont.load_default()
    for box, score, label in zip(boxes, scores, labels):
        box = [round(coord, 2) for coord in box.tolist()]
        draw.rectangle(box, outline="red", width=2)
        # Label each box with its text query and confidence score.
        draw.text((box[0], box[1]), f"{texts[label]}: {round(score.item(), 2)}", fill="blue", font=font)
    image.show()

def run_me(target_object: str, picture_path: str):
    # picture_path is a local file path: Image.open does not fetch URLs.
    processor, model = get_model()
    image = get_image(picture_path)
    inputs, texts = get_texts(processor, image, target_object)
    results = process(model, processor, inputs, image)
    i = 0  # Retrieve predictions for the first image for the corresponding text queries
    boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
    text = texts[i]
    draw_boxes(image, boxes, scores, labels, text)
    print(f"Detections above threshold: {len(boxes)}")
    # Collect every detection rather than returning from inside the loop,
    # which would discard all but the first result.
    detections = []
    for box, score, label in zip(boxes, scores, labels):
        box = [round(coord, 2) for coord in box.tolist()]
        detections.append(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
    return detections