clip/server.py
Go to the documentation of this file.
1 import clip
2 import cv2
3 import numpy as np
4 import os
5 from PIL import Image as PLImage
6 import torch
7 
8 # web server
9 from flask import Flask, request, Response
10 import json
11 import base64
12 
13 
def apply_half(t):
    """Cast *t* to half precision when it is float32; any other dtype passes through unchanged."""
    return t.to(dtype=torch.half) if t.dtype == torch.float32 else t
18 
class Inference:
    """CLIP ViT-B/32 wrapper that scores an image against a list of text queries."""

    def __init__(self, gpu_id=None):
        self.gpu_id = gpu_id
        # Run on GPU when torch can see one; otherwise fall back to CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.preprocess = clip.load('ViT-B/32', self.device)

    def infer(self, img, texts):
        """Score *img* (a BGR cv2 array) against each string in *texts*.

        Returns a dict mapping each query text to a tuple of
        (softmax probability over the queries, raw cosine similarity).
        """
        # cv2 frames are BGR; force a fixed size, convert to RGB, and hand to PIL.
        frame = cv2.resize(img, dsize=(640, 480))  # NOTE forcely
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        pil_image = PLImage.fromarray(frame)
        image_input = self.preprocess(pil_image).unsqueeze(0).to(self.device)
        text_inputs = torch.cat([clip.tokenize(q) for q in texts]).to(self.device)
        with torch.no_grad():
            image_features = self.model.encode_image(image_input)
            text_features = self.model.encode_text(text_inputs)
            # L2-normalize both embeddings so the dot product is a cosine similarity.
            image_features /= image_features.norm(dim=-1, keepdim=True)
            text_features /= text_features.norm(dim=-1, keepdim=True)
            probability = (100.0 * image_features @ text_features.T).softmax(dim=-1)
            # cosine similarity per query, as a 1-D numpy vector
            similarity = (text_features.cpu().numpy() @ image_features.cpu().numpy().T).T[0]
            values, indices = probability[0].topk(len(texts))
            return {texts[idx]: (val.item(), float(similarity[idx]))
                    for val, idx in zip(values, indices)}
44 
# run
if __name__ == "__main__":
    app = Flask(__name__)
    infer = Inference()

    @app.route("/inference", methods=['POST'])
    def caption_request():
        """Handle a POST whose JSON body carries a base64 image and query texts.

        Expected body: {"image": <base64-encoded image>, "queries": [<str>, ...]}
        Returns JSON: {"results": [{"question", "probability", "similarity"}, ...]}
        """
        data = request.data.decode("utf-8")
        data_json = json.loads(data)
        # process image: base64 -> raw bytes -> uint8 buffer -> BGR cv2 image
        image_b = data_json['image']
        image_dec = base64.b64decode(image_b)
        # np.frombuffer replaces np.fromstring, which is deprecated (and removed
        # for binary input in recent NumPy releases)
        data_np = np.frombuffer(image_dec, dtype='uint8')
        img = cv2.imdecode(data_np, 1)
        # get text
        texts = data_json['queries']
        infer_results = infer.infer(img, texts)
        results = []
        for q in infer_results:
            results.append({"question": q,
                            "probability": infer_results[q][0],
                            "similarity": infer_results[q][1]})
        # declare the JSON content type explicitly (Flask's default is text/html)
        return Response(response=json.dumps({"results": results}),
                        status=200, mimetype="application/json")

    app.run("0.0.0.0", 8080, threaded=True)
server.Inference.device
device
Definition: clip/server.py:22
ssd_train_dataset.float
float
Definition: ssd_train_dataset.py:180
server.caption_request
def caption_request()
Definition: clip/server.py:51
server.Inference.gpu_id
gpu_id
Definition: clip/server.py:21
server.Inference
Definition: clip/server.py:19
server.Inference.preprocess
preprocess
Definition: clip/server.py:23
server.Inference.encode_text
def encode_text(self, text, length=None, append_bos=False, append_eos=False)
Definition: ofa/server.py:127
server.apply_half
def apply_half(t)
Definition: clip/server.py:14
server.Inference.__init__
def __init__(self, gpu_id=None)
Definition: clip/server.py:20
server.Inference.infer
def infer(self, img, texts)
Definition: clip/server.py:25


jsk_perception
Author(s): Manabu Saito, Ryohei Ueda
autogenerated on Fri May 16 2025 03:11:17