mdz/pytorch/yolov9_pan/1_scripts/yolov9_utils.py

678 lines
23 KiB
Python

from distutils.command import sdist
import numpy as np
from PIL import Image, ImageDraw, ImageFont
import cv2
from pathlib import Path
# NumPy-only re-implementations of the helpers below, for when you do not want to import torch
class DFL:
    """NumPy version of the DFL decoding module.

    The input holds, for each of the 4 box sides, ``c1`` logits per anchor.
    Decoding takes a softmax over those ``c1`` bins and returns the
    expectation over the bin indices ``0 .. c1 - 1``.
    """

    def __init__(self, c1=17):
        self.c1 = c1
        # The "kernel" of the 1x1 convolution is simply the bin indices.
        bin_indices = np.arange(c1, dtype=np.float32)
        self.conv_weights = bin_indices.reshape(1, c1, 1, 1)

    def softmax(self, x, axis):
        """Softmax along ``axis``; the max is subtracted first for stability."""
        shifted = x - np.max(x, axis=axis, keepdims=True)
        exps = np.exp(shifted)
        return exps / np.sum(exps, axis=axis, keepdims=True)

    def conv2d(self, x, weights):
        """Emulate a single-output-channel 1x1 convolution (weighted channel sum)."""
        weighted = x * weights
        return np.sum(weighted, axis=1, keepdims=True)

    def forward(self, x):
        """Decode ``(batch, 4 * c1, anchors)`` logits into ``(batch, 4, anchors)``."""
        batch, _, anchors = x.shape
        per_side = x.reshape(batch, 4, self.c1, anchors)
        bins_first = per_side.transpose(0, 2, 1, 3)  # (batch, c1, 4, anchors)
        probs = self.softmax(bins_first, axis=1)
        expected = self.conv2d(probs, self.conv_weights)  # (batch, 1, 4, anchors)
        return expected.reshape(batch, 4, anchors)
def make_anchors(feats, strides, grid_cell_offset=0.5):
    """Generate anchor points and matching strides for a list of feature maps.

    Args:
        feats: sequence of 4-D arrays; only the last two dimensions (h, w) of
            ``feats[i]`` are used.
        strides: one stride value per feature map.
        grid_cell_offset: offset added to every integer grid coordinate.

    Returns:
        ``(anchor_points, stride_tensor)``: the concatenated ``(sum(h*w), 2)``
        array of ``(x, y)`` points and the ``(sum(h*w), 1)`` array of strides.
    """
    assert feats is not None
    points_per_level = []
    strides_per_level = []
    for level, stride in enumerate(strides):
        _, _, height, width = feats[level].shape
        xs = np.arange(width) + grid_cell_offset
        ys = np.arange(height) + grid_cell_offset
        grid_y, grid_x = np.meshgrid(ys, xs, indexing='ij')
        level_points = np.stack((grid_x, grid_y), axis=-1).reshape(-1, 2)
        points_per_level.append(level_points)
        strides_per_level.append(np.full((height * width, 1), stride))
    return np.concatenate(points_per_level), np.concatenate(strides_per_level)
def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
    """Convert (left, top, right, bottom) distances around anchors into boxes.

    Args:
        distance: array whose ``dim`` axis is split into two halves, lt and rb.
        anchor_points: anchor coordinates, broadcastable against each half.
        xywh: return ``(cx, cy, w, h)`` when True, ``(x1, y1, x2, y2)`` otherwise.
        dim: axis holding the four distances.
    """
    left_top, right_bottom = np.split(distance, 2, axis=dim)
    top_left = anchor_points - left_top
    bottom_right = anchor_points + right_bottom
    if not xywh:
        return np.concatenate((top_left, bottom_right), axis=dim)  # xyxy bbox
    centre = (top_left + bottom_right) / 2
    size = bottom_right - top_left
    return np.concatenate((centre, size), axis=dim)  # xywh bbox
def sigmoid(x):
    """Numerically stable logistic function ``1 / (1 + exp(-x))``.

    The naive formula overflows in ``exp(-x)`` for large negative ``x`` and
    emits a RuntimeWarning. Here the exponential is only ever taken of a
    non-positive number, so it stays within (0, 1].

    Args:
        x: scalar or array-like of real numbers.

    Returns:
        Sigmoid of ``x``: an array for array input, a NumPy scalar for scalar
        input.
    """
    x = np.asarray(x)
    z = np.exp(-np.abs(x))  # in (0, 1], never overflows
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same value, rewritten.
    out = np.where(x >= 0, 1 / (1 + z), z / (1 + z))
    return out[()]  # unwrap 0-d results so scalars stay scalars
def nms(boxes, scores, iou_threshold):
    """
    Implements Non-Maximum Suppression (NMS) to filter the predictions.

    Boxes are visited in descending score order; each kept box suppresses every
    remaining box whose IoU with it is strictly greater than ``iou_threshold``.

    Args:
        boxes (numpy.ndarray): Array of shape (n, 4) where n is the number of predicted boxes.
            Each row represents [x1, y1, x2, y2].
        scores (numpy.ndarray): Array of shape (n,) where n is the number of predicted boxes.
            Each element represents the confidence score of the corresponding box.
        iou_threshold (float): IOU threshold for NMS.

    Returns:
        List[int]: List of indices of the boxes to keep, best score first.
    """
    boxes = np.asarray(boxes)
    scores = np.asarray(scores)
    # If no boxes, return an empty list
    if len(boxes) == 0:
        return []

    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)

    # Candidate indices, highest score first
    order = scores.argsort()[::-1]
    keep = []
    while order.size > 0:
        best, rest = order[0], order[1:]
        keep.append(best)

        # Intersection of the best box with every remaining candidate
        xx1 = np.maximum(x1[best], x1[rest])
        yy1 = np.maximum(y1[best], y1[rest])
        xx2 = np.minimum(x2[best], x2[rest])
        yy2 = np.minimum(y2[best], y2[rest])
        inter = np.maximum(0, xx2 - xx1) * np.maximum(0, yy2 - yy1)

        # Clamp the union so zero-area boxes give IoU 0 instead of 0/0 = NaN
        # (NaN would fail the comparison below and silently drop the box).
        union = areas[best] + areas[rest] - inter
        iou = inter / np.maximum(union, 1e-6)

        # Only candidates that do not overlap too much survive this round
        order = rest[iou <= iou_threshold]
    return keep
def compute_iou(boxes1, boxes2):
    """Compute the row-wise IoU of two equally shaped sets of xyxy boxes.

    Row ``k`` of ``boxes1`` is compared with row ``k`` of ``boxes2``; the union
    is clamped to at least 1e-6 so degenerate boxes yield 0 rather than NaN.
    """
    def _area(b):
        return (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])

    # Corners of the overlap rectangle
    left = np.maximum(boxes1[:, 0], boxes2[:, 0])
    top = np.maximum(boxes1[:, 1], boxes2[:, 1])
    right = np.minimum(boxes1[:, 2], boxes2[:, 2])
    bottom = np.minimum(boxes1[:, 3], boxes2[:, 3])

    overlap_w = np.maximum(right - left, 0)
    overlap_h = np.maximum(bottom - top, 0)
    intersection = overlap_w * overlap_h

    union = _area(boxes1) + _area(boxes2) - intersection
    return intersection / np.maximum(union, 1e-6)
def xywh2xyxy(x):
    """Convert nx4 boxes from [cx, cy, w, h] to [x1, y1, x2, y2].

    (x1, y1) is the top-left corner and (x2, y2) the bottom-right one. The
    input is not modified; a copy with the first four columns rewritten is
    returned.
    """
    y = np.copy(x)
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2
    y[:, 0] = x[:, 0] - half_w  # top left x
    y[:, 1] = x[:, 1] - half_h  # top left y
    y[:, 2] = x[:, 0] + half_w  # bottom right x
    y[:, 3] = x[:, 1] + half_h  # bottom right y
    return y
def non_max_suppression(
        prediction,
        conf_thres=0.25,
        iou_thres=0.45,
        classes=None,
        multi_label=False,
        labels=(),
        max_det=300,
        nm=0,  # number of masks
):
    """Non-Maximum Suppression (NMS) on inference results to reject overlapping detections.

    Args:
        prediction: array of shape (bs, 4 + nc + nm, n) holding, per candidate,
            an xywh box, nc class scores and nm mask coefficients. A list/tuple
            is also accepted, in which case only its first element is used.
        conf_thres: minimum class score for a candidate to be considered.
        iou_thres: IoU threshold passed to ``nms``.
        classes: optional iterable of class indices to keep.
        multi_label: emit one detection per (box, class) pair above the
            threshold instead of only the best class (ignored when nc == 1).
        labels: optional per-image apriori labels, rows of [cls, x, y, w, h],
            appended to the candidates of that image.
        max_det: maximum number of detections kept per image.
        nm: number of mask coefficients per candidate.

    Returns:
        list with one (n, 6 + nm) array per image: [xyxy, conf, cls, masks...].
    """
    # Checks (kept as assertions so the exception type is unchanged); run
    # before the thresholds are used for the first time.
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'

    if isinstance(prediction, (list, tuple)):  # YOLO model in validation model, output = (inference_out, loss_out)
        prediction = prediction[0]  # select only inference output

    bs = prediction.shape[0]  # batch size
    nc = prediction.shape[1] - nm - 4  # number of classes (84+32-32-4)
    mi = 4 + nc  # mask start index
    xc = prediction[:, 4:mi].max(1) > conf_thres  # candidates

    # Settings
    max_wh = 7680  # (pixels) maximum box width and height, used as per-class offset
    max_nms = 30000  # maximum number of boxes passed to nms()
    multi_label &= nc > 1  # multiple labels per box

    # One independent empty result per image (a shared array would be aliased
    # between images if a caller edits results in place).
    output = [np.zeros((0, 6 + nm)) for _ in range(bs)]
    for xi, x in enumerate(prediction):  # image index, image inference
        x = x.T[xc[xi]]  # (candidates, 4 + nc + nm) above the confidence threshold

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            lb = labels[xi]
            # Must match x's column count (4 + nc + nm): there is no separate
            # objectness column in this layout, so "+ 5" would break the concat.
            v = np.zeros((len(lb), nc + nm + 4))
            v[:, :4] = lb[:, 1:5]  # box
            v[np.arange(len(lb)), lb[:, 0].astype(int) + 4] = 1.0  # cls
            x = np.concatenate((x, v), axis=0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Detections matrix nx6 (xyxy, conf, cls)
        box = xywh2xyxy(x[:, :4])  # (center_x, center_y, width, height) to (x1, y1, x2, y2)
        cls = x[:, 4:mi]
        mask = x[:, mi:]
        if multi_label:
            i, j = np.nonzero(cls > conf_thres)
            x = np.concatenate((box[i], x[i, 4 + j, None], j[:, None].astype(float), mask[i]), axis=1)
        else:  # best class only
            conf = np.amax(cls, axis=1, keepdims=True)
            j = np.argmax(cls, axis=1, keepdims=True).astype(float)
            x = np.concatenate((box, conf, j, mask), axis=1)
            x = x[conf.flatten() > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == np.array(classes)).any(axis=1)]

        # Check shape
        if not x.shape[0]:  # no boxes
            continue
        # Sort by confidence (descending) and drop any excess beyond max_nms
        x = x[x[:, 4].argsort()[::-1][:max_nms]]

        # Batched NMS: offsetting boxes by class keeps classes from suppressing each other
        c = x[:, 5:6] * max_wh  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = nms(boxes, scores, iou_thres)
        if len(i) > max_det:  # limit detections
            i = i[:max_det]
        output[xi] = x[i]
    return output
# Class names, ordered by class index (ten names per row).
COCO_CLASSES = (
    "person", "bicycle", "car", "motorcycle", "airplane",
    "bus", "train", "truck", "boat", "traffic light",
    "fire hydrant", "stop sign", "parking meter", "bench", "bird",
    "cat", "dog", "horse", "sheep", "cow",
    "elephant", "bear", "zebra", "giraffe", "backpack",
    "umbrella", "handbag", "tie", "suitcase", "frisbee",
    "skis", "snowboard", "sports ball", "kite", "baseball bat",
    "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle",
    "wine glass", "cup", "fork", "knife", "spoon",
    "bowl", "banana", "apple", "sandwich", "orange",
    "broccoli", "carrot", "hot dog", "pizza", "donut",
    "cake", "chair", "couch", "potted plant", "bed",
    "dining table", "toilet", "tv", "laptop", "mouse",
    "remote", "keyboard", "cell phone", "microwave", "oven",
    "toaster", "sink", "refrigerator", "book", "clock",
    "vase", "scissors", "teddy bear", "hair drier", "toothbrush",
)
def clip_boxes(boxes, shape):
    """Clip xyxy boxes in place to an image of ``shape`` (height, width, ...).

    x coordinates (columns 0 and 2) are limited to [0, width] and y
    coordinates (columns 1 and 3) to [0, height]. Nothing is returned.
    """
    height, width = shape[0], shape[1]
    x_cols, y_cols = [0, 2], [1, 3]
    boxes[:, x_cols] = np.clip(boxes[:, x_cols], 0, width)  # x1, x2
    boxes[:, y_cols] = np.clip(boxes[:, y_cols], 0, height)  # y1, y2
def scale_boxes(img1_shape, boxes, img0_shape, ratio_pad=None):
    """Rescale xyxy boxes in place from ``img1_shape`` to ``img0_shape``.

    Args:
        img1_shape: (h, w) of the image the boxes currently refer to.
        boxes: (n, >=4) array of xyxy boxes; modified in place.
        img0_shape: (h, w, ...) of the target image.
        ratio_pad: optional ``((gain, ...), (pad_x, pad_y))``; when omitted the
            gain and padding are derived from the two shapes.

    Returns:
        The same ``boxes`` array, with padding removed, divided by the gain
        and clipped to ``img0_shape``.
    """
    if ratio_pad is not None:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]
    else:  # calculate from img0_shape
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])  # gain = old / new
        pad_x = (img1_shape[1] - img0_shape[1] * gain) / 2
        pad_y = (img1_shape[0] - img0_shape[0] * gain) / 2
        pad = pad_x, pad_y  # wh padding

    boxes[:, [0, 2]] -= pad[0]  # x padding
    boxes[:, [1, 3]] -= pad[1]  # y padding
    boxes[:, :4] /= gain
    clip_boxes(boxes, img0_shape)
    return boxes
def crop_mask(masks, boxes):
    """
    "Crop" predicted masks by zeroing out everything not in the predicted bbox.

    Args:
        - masks: a size [n, h, w] array of masks
        - boxes: a size [n, 4] array of xyxy bbox coords, in mask pixel units

    Returns:
        - cropped_masks: a size [n, h, w] array of cropped masks
    """
    _, height, width = masks.shape
    x1, y1, x2, y2 = np.split(boxes[:, None, :], 4, axis=2)  # each shape (n, 1, 1)
    col_idx = np.arange(width)[None, None, :]  # x positions, shape (1, 1, w)
    row_idx = np.arange(height)[None, :, None]  # y positions, shape (1, h, 1)
    inside_x = (col_idx >= x1) & (col_idx < x2)
    inside_y = (row_idx >= y1) & (row_idx < y2)
    return masks * (inside_x & inside_y)
def process_mask(protos, masks_in, bboxes, shape, upsample=False):
    """
    Crop before upsample.

    protos: [mask_dim, mask_h, mask_w]
    masks_in: [n, mask_dim], n is number of masks after NMS
    bboxes: [n, 4], xyxy boxes in input-image pixels, n is number of masks after NMS
    shape: input_image_size, (h, w)
    upsample: Whether to upsample the masks to ``shape``

    Returns:
        Binary uint8 masks: [h, w, n] when ``upsample`` is True, otherwise
        [n, mask_h, mask_w].
    """
    c, mh, mw = protos.shape  # CHW
    ih, iw = shape
    # Linear combination of the prototypes per detection, squashed to (0, 1)
    masks = sigmoid(np.dot(masks_in, protos.reshape(c, -1))).reshape(-1, mh, mw)

    # Bring the boxes down to prototype resolution before cropping
    downsampled_bboxes = bboxes.copy()
    downsampled_bboxes[:, 0] *= mw / iw
    downsampled_bboxes[:, 2] *= mw / iw
    downsampled_bboxes[:, 3] *= mh / ih
    downsampled_bboxes[:, 1] *= mh / ih
    masks = crop_mask(masks, downsampled_bboxes)  # nHW

    if upsample:
        if masks.shape[0] == 0:
            # cv2.resize cannot handle an image with zero channels
            masks = np.zeros((ih, iw, 0), dtype=masks.dtype)
        else:
            masks = cv2.resize(masks.transpose(1, 2, 0), (iw, ih), interpolation=cv2.INTER_LINEAR)  # HWn
            if masks.ndim == 2:
                # cv2.resize drops the channel axis for a single mask; restore it
                masks = masks[:, :, None]
    return (masks > 0.5).astype(np.uint8)  # Convert to binary masks
class Colors:
    """Ultralytics color palette (https://ultralytics.com/), indexed cyclically."""

    def __init__(self):
        # hex = matplotlib.colors.TABLEAU_COLORS.values()
        hexs = (
            'FF3838', 'FF9D97', 'FF701F', 'FFB21D', 'CFD231',
            '48F90A', '92CC17', '3DDB86', '1A9334', '00D4BB',
            '2C99A8', '00C2FF', '344593', '6473FF', '0018EC',
            '8438FF', '520085', 'CB38FF', 'FF95C8', 'FF37C7',
        )
        self.palette = [self.hex2rgb(f'#{code}') for code in hexs]
        self.n = len(self.palette)

    def __call__(self, i, bgr=False):
        """Return the colour for index ``i`` (wrapping around), as RGB or BGR."""
        rgb = self.palette[int(i) % self.n]
        if bgr:
            return rgb[2], rgb[1], rgb[0]
        return rgb

    @staticmethod
    def hex2rgb(h):  # rgb order (PIL)
        """Convert a '#RRGGBB' string into an (r, g, b) tuple of ints."""
        return tuple(int(h[start:start + 2], 16) for start in (1, 3, 5))


colors = Colors()  # create instance for 'from utils.plots import colors'
def box_label(im, box, label='', color=(128, 128, 128), txt_color=(255, 255, 255)):
    """Draw a box, and optionally a filled label tag, onto ``im`` in place.

    Args:
        im: image array accepted by the cv2 drawing functions.
        box: sequence whose first four items are x1, y1, x2, y2.
        label: text to draw at the top-left corner; nothing is drawn if empty.
        color: colour of the box outline and of the label background.
        txt_color: colour of the label text.

    Returns:
        ``np.asarray(im)``.
    """
    top_left = (int(box[0]), int(box[1]))
    bottom_right = (int(box[2]), int(box[3]))
    cv2.rectangle(im, top_left, bottom_right, color, thickness=2, lineType=cv2.LINE_AA)
    if not label:
        return np.asarray(im)

    font_thickness = 2
    font_scale = 3 / 3
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    # Put the tag above the box when there is room, otherwise just below its top edge
    fits_above = top_left[1] - text_h >= 3
    if fits_above:
        tag_y = top_left[1] - text_h - 3
        text_y = top_left[1] - 2
    else:
        tag_y = top_left[1] + text_h + 3
        text_y = top_left[1] + text_h + 2
    tag_corner = (top_left[0] + text_w, tag_y)
    cv2.rectangle(im, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled
    cv2.putText(im,
                label, (top_left[0], text_y),
                0,
                font_scale,
                txt_color,
                thickness=font_thickness,
                lineType=cv2.LINE_AA)
    return np.asarray(im)
def scale_image(im1_shape, masks, im0_shape, ratio_pad=None):
    """
    Remove the padding from ``masks`` and resize them to the original image.

    img1_shape: model input shape, [h, w]
    img0_shape: origin pic shape, [h, w, 3]
    masks: [h, w, num]
    ratio_pad: optional (ratio, (pad_x, pad_y)); only the padding is used

    Returns masks of shape [img0_h, img0_w, num] (a trailing axis is added when
    the resize returns a 2-D array).
    """
    if ratio_pad is not None:
        pad = ratio_pad[1]
    else:  # calculate from im0_shape
        gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])  # gain = old / new
        pad_w = (im1_shape[1] - im0_shape[1] * gain) / 2
        pad_h = (im1_shape[0] - im0_shape[0] * gain) / 2
        pad = pad_w, pad_h  # wh padding

    top = int(pad[1])  # y
    left = int(pad[0])  # x
    bottom = int(im1_shape[0] - pad[1])
    right = int(im1_shape[1] - pad[0])

    if len(masks.shape) < 2:
        raise ValueError(f'"len of masks shape" should be 2 or 3, but got {len(masks.shape)}')
    unpadded = masks[top:bottom, left:right]
    # masks shape: [h, w, n]
    resized = cv2.resize(unpadded, (im0_shape[1], im0_shape[0]))
    if len(resized.shape) == 2:
        return resized[:, :, None]
    return resized
def crop_image(im1_shape, masks, im0_shape, ratio_pad=None):
    """
    Cut the padding border off ``masks`` without resizing them.

    img1_shape: model input shape, [h, w]
    img0_shape: origin pic shape, [h, w, 3]
    masks: [h, w, num]
    ratio_pad: optional (ratio, (pad_x, pad_y)); only the padding is used
    """
    if ratio_pad is not None:
        pad = ratio_pad[1]
    else:  # calculate from im0_shape
        gain = min(im1_shape[0] / im0_shape[0], im1_shape[1] / im0_shape[1])  # gain = old / new
        pad_w = (im1_shape[1] - im0_shape[1] * gain) / 2
        pad_h = (im1_shape[0] - im0_shape[0] * gain) / 2
        pad = pad_w, pad_h  # wh padding

    top = int(pad[1])  # y
    left = int(pad[0])  # x
    bottom = int(im1_shape[0] - pad[1])
    right = int(im1_shape[1] - pad[0])

    if len(masks.shape) < 2:
        raise ValueError(f'"len of masks shape" should be 2 or 3, but got {len(masks.shape)}')
    return masks[top:bottom, left:right]
def draw_masks(im, masks, colors, alpha=0.5):
    """Plot masks at once, blending them into ``im`` in place.

    Args:
        im (array): image to draw on, shape [H, W, 3]; modified in place.
        masks (array): predicted masks, shape: [h, w, n]
        colors (List[List[Int]]): colors for predicted masks, [[r, g, b] * n]
        alpha (float): mask transparency: 0.0 fully transparent, 1.0 opaque

    Returns:
        ``np.asarray(im)``. When there are no masks the image is returned
        unchanged.
    """
    masks = np.asarray(masks)
    # ``len(masks)`` is the height for [h, w, n] input, so it cannot detect
    # n == 0; ``size`` covers every empty layout.
    if masks.size == 0:
        return np.asarray(im)
    masks = scale_image(masks.shape[:2], masks, im.shape)  # scale masks to im's shape
    masks = np.asarray(masks, dtype=np.float32)
    colors = np.asarray(colors, dtype=np.float32)  # shape(n,3)
    s = masks.sum(2, keepdims=True).clip(0, 1)  # per-pixel coverage by any mask
    masks = (masks @ colors).clip(0, 255)  # (h,w,n) @ (n,3) = (h,w,3)
    im[:] = masks * alpha + im * (1 - s * alpha)
    return np.asarray(im)
# COCO category ids. The numbering has gaps, so a position in one of these
# lists is not the same as the id stored at that position.

# Ids used for the 'instances' task (80 entries).
all_instances_ids = [
    1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
    11, 13, 14, 15, 16, 17, 18, 19, 20,
    21, 22, 23, 24, 25, 27, 28,
    31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
    41, 42, 43, 44, 46, 47, 48, 49, 50,
    51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
    61, 62, 63, 64, 65, 67, 70,
    72, 73, 74, 75, 76, 77, 78, 79, 80,
    81, 82, 84, 85, 86, 87, 88, 89, 90,
]
# Ids used for the 'stuff' task: 92..182, then "other" and "unlabeled".
all_stuff_ids = [
    92, 93, 94, 95, 96, 97, 98, 99, 100,
    101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
    111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
    121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
    131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
    141, 142, 143, 144, 145, 146, 147, 148, 149, 150,
    151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
    161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
    171, 172, 173, 174, 175, 176, 177, 178, 179, 180,
    181, 182,
    # other
    183,
    # unlabeled
    0,
]
# Stuff ids used for the 'panoptic' task; see
# panoptic id: https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json
panoptic_stuff_ids = [
    92, 93, 95, 100,
    107, 109,
    112, 118, 119,
    122, 125, 128, 130,
    133, 138,
    141, 144, 145, 147, 148, 149,
    151, 154, 155, 156, 159,
    161, 166, 168,
    171, 175, 176, 177, 178, 180,
    181, 184, 185, 186, 187, 188, 189, 190,
    191, 192, 193, 194, 195, 196, 197, 198, 199, 200,
    # unlabeled
    0,
]
def getCocoIds(name = 'semantic'):
    """Return the list of COCO category ids for the task called ``name``.

    'instances' and 'stuff' return the module-level lists themselves;
    'panoptic' returns the instance ids followed by the panoptic stuff ids;
    any other name (the 'semantic' default) returns the instance ids followed
    by all stuff ids.
    """
    if name == 'instances':
        return all_instances_ids
    if name == 'stuff':
        return all_stuff_ids
    # Both remaining tasks start with the instance ids and append a stuff list.
    stuff_ids = panoptic_stuff_ids if name == 'panoptic' else all_stuff_ids
    return all_instances_ids + stuff_ids
def getMappingId(index, name = 'semantic'):
    """Return the COCO category id stored at position ``index`` for task ``name``."""
    return getCocoIds(name = name)[index]
def getMappingIndex(id, name = 'semantic'):
    """Return the position of COCO category ``id`` in the id list of task ``name``.

    Raises ValueError (from ``list.index``) when the id is not in that list.
    """
    return getCocoIds(name = name).index(id)
def panoptic_merge_show(semask, masks, labels, conf, min_area):
    """Merge a semantic map and instance masks into one 3-channel int32 map.

    Args:
        semask: 2-D array of semantic class indices.
        masks: (n, h, w) array of instance masks; pixels equal to 1 belong to
            the instance.
        labels: length-n array of instance class indices (positions in the
            'instances' id list).
        conf: length-n array of confidences; reordered with the masks but not
            otherwise used.
        min_area: connected stuff regions with at most this many pixels are
            reset to unlabeled.

    Returns:
        (h, w, 3) int32 array. Channel 0 holds the category id, channel 1 holds
        ``category_id * 1000 + instance_index`` on instance pixels (0
        elsewhere), channel 2 holds the same instance code on instance pixels
        and the stuff id elsewhere. 255 is used internally for "unlabeled" and
        converted to 0 before returning.
    """
    panoptic = np.zeros(semask.shape + (3,), dtype=np.int32)
    # Views into the three output channels; writing to them edits ``panoptic``
    cat_ch = panoptic[:, :, 0]
    inst_ch = panoptic[:, :, 1]
    seg_ch = panoptic[:, :, 2]

    # Semantic indices below 80 are left to the instance masks; the rest are
    # translated to their category id through the 'semantic' id list
    stuff = np.zeros_like(semask)
    for sem_cls in np.unique(semask):
        if sem_cls < 80:
            stuff[semask == sem_cls] = 255
        else:
            stuff[semask == sem_cls] = getMappingId(sem_cls, 'semantic')
    seg_ch[:] = stuff
    cat_ch[:] = stuff

    # merge inst: paint large instances first so smaller ones end up on top
    order = np.argsort(np.sum(masks, axis=(1, 2)))[::-1]
    masks = masks[order]
    labels = labels[order]
    conf = conf[order]
    for inst_id, (mask, label) in enumerate(zip(masks, labels)):
        covered = (mask == 1)
        category_id = getMappingId(int(label), 'instances')
        inst_ch[covered] = category_id * 1000 + inst_id
        seg_ch[covered] = category_id * 1000 + inst_id
        cat_ch[covered] = category_id

    # merge stuff: drop small connected regions of every class that still
    # shows on pixels not claimed by an instance
    not_instance = inst_ch == 0
    for stuff_cls in np.unique(seg_ch[not_instance]):
        binary = (seg_ch == stuff_cls).astype(np.uint8)
        num, components = cv2.connectedComponents(binary)
        for comp in range(1, num):  # component 0 is the background
            comp_map = components == comp
            if np.count_nonzero(comp_map) <= min_area:
                seg_ch[comp_map] = 255
                cat_ch[comp_map] = 255

    # Convert 255 to Unlabeled
    panoptic[panoptic == 255] = 0
    return panoptic
def panoptic_merge_coco(semask, masks, labels, min_area):
    """Merge a semantic map and instance masks into one 3-channel int32 map.

    Unlike ``panoptic_merge_show`` no index-to-id translation is applied:
    values of ``semask`` and ``labels`` are written as they are, and instances
    are painted in the given order.

    Args:
        semask: 2-D array of semantic values; values below 92 are treated as
            unlabeled here and left to the instance masks.
        masks: sequence of (h, w) instance masks; pixels equal to 1 belong to
            the instance.
        labels: per-instance category values, indexed like ``masks``.
        min_area: connected stuff regions with at most this many pixels are
            reset to unlabeled in channel 2.

    Returns:
        (h, w, 3) int32 array. Channel 0 holds the category value, channel 1
        holds ``label * 1000 + instance_index`` on instance pixels (0
        elsewhere), channel 2 holds the same instance code on instance pixels
        and the stuff value elsewhere. 255 is used internally for "unlabeled"
        and converted to 0 before returning.
    """
    panoptic = np.zeros(semask.shape + (3,), dtype=np.int32)
    # Views into the three output channels; writing to them edits ``panoptic``
    cat_ch = panoptic[:, :, 0]
    inst_ch = panoptic[:, :, 1]
    seg_ch = panoptic[:, :, 2]

    stuff = np.zeros_like(semask)
    for sem_cls in np.unique(semask):
        stuff[semask == sem_cls] = 255 if sem_cls < 92 else sem_cls
    seg_ch[:] = stuff
    cat_ch[:] = stuff

    # merge inst
    for inst_id in range(len(masks)):
        covered = (masks[inst_id] == 1)
        inst_ch[covered] = labels[inst_id] * 1000 + inst_id
        seg_ch[covered] = labels[inst_id] * 1000 + inst_id
        cat_ch[covered] = labels[inst_id]

    # merge stuff: drop small connected regions of every class that still
    # shows on pixels not claimed by an instance (channel 2 only)
    not_instance = inst_ch == 0
    for stuff_cls in np.unique(seg_ch[not_instance]):
        binary = (seg_ch == stuff_cls).astype(np.uint8)
        num, components = cv2.connectedComponents(binary)
        for comp in range(1, num):  # component 0 is the background
            comp_map = components == comp
            if np.count_nonzero(comp_map) <= min_area:
                seg_ch[comp_map] = 255

    # Convert 255 to Unlabeled
    panoptic[panoptic == 255] = 0
    return panoptic