Real-Time Object Detection¶
Objective: Implement pedestrian detection using a pretrained Faster R-CNN model.
Faster R-CNN is an object detection model that improves on its predecessor, Fast R-CNN, by adding a Region Proposal Network (RPN) that shares full-image convolutional features with the detection network, making region proposals nearly cost-free. The RPN is a fully convolutional network that simultaneously predicts object bounds and objectness scores at each position and is trained end-to-end to generate high-quality region proposals. The Faster R-CNN architecture consists of two main modules: a deep fully convolutional network that proposes regions and a Fast R-CNN detector that classifies and refines those proposals. Despite its name, Faster R-CNN typically trades speed for accuracy: it is more accurate but slower at inference than single-stage models such as YOLOv3 or MobileNet-based detectors.
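To make this two-module structure concrete, torchvision's implementation (used throughout this notebook) exposes the shared backbone, the RPN, and the Fast R-CNN detector as separate submodules. A minimal sketch, built with random weights purely to inspect the structure:

from torchvision.models.detection import fasterrcnn_resnet50_fpn
# Build the architecture without downloading pretrained weights
model = fasterrcnn_resnet50_fpn(weights=None)
print(type(model.backbone).__name__)   # BackboneWithFPN: shared convolutional features
print(type(model.rpn).__name__)        # RegionProposalNetwork: proposes candidate boxes
print(type(model.roi_heads).__name__)  # RoIHeads: Fast R-CNN detector over the proposals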
Import libraries¶
import torch
from torchvision.transforms import Resize, functional
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
from torchvision.io import decode_image
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
from PIL import ImageDraw, ImageFont
import os
Download the dataset¶
%%bash
wget -nc --progress=bar:force:noscroll https://www.cis.upenn.edu/~jshi/ped_html/PennFudanPed.zip -P /tmp
wget -nc --progress=bar:force:noscroll https://assets.ubuntu.com/v1/0cef8205-ubuntu-font-family-0.83.zip -P /tmp
unzip -qn /tmp/PennFudanPed.zip -d /tmp
unzip -qn /tmp/0cef8205-ubuntu-font-family-0.83.zip -d /tmp
File '/tmp/PennFudanPed.zip' already there; not retrieving.
File '/tmp/0cef8205-ubuntu-font-family-0.83.zip' already there; not retrieving.
Visualize the dataset¶
The Penn-Fudan Database is a pedestrian detection and segmentation dataset. It contains 170 images with 345 labeled pedestrians, taken from scenes around the University of Pennsylvania and Fudan University, and is commonly used for training and evaluating object detection and instance segmentation models.
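As a quick sanity check, the advertised image count can be verified after extraction. A small sketch, assuming the archive's standard PNGImages/PedMasks layout:

import os
images = sorted(os.listdir("/tmp/PennFudanPed/PNGImages"))
masks = sorted(os.listdir("/tmp/PennFudanPed/PedMasks"))
print(f"{len(images)} images, {len(masks)} instance masks")  # expected: 170 of each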
path = "/tmp/PennFudanPed/PNGImages"
files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith(".png")]
resize = Resize(size=(450, 450))
frames = []
fig, ax = plt.subplots()
for file in files:
    # Decode each image into a CxHxW uint8 tensor and resize it to a common size
    image = decode_image(file)
    image = resize(image)
    plt.tight_layout()
    ax.set_title("Frames")
    ax.axis("off")
    # ArtistAnimation expects a list of artists per frame
    frame = ax.imshow(image.permute(1, 2, 0), animated=True)
    frames.append([frame])
anim = animation.ArtistAnimation(fig, frames, blit=True, repeat_delay=1000)
plt.close(fig)
HTML(anim.to_html5_video())
Load the pretrained Faster R-CNN¶
The model was pretrained on the COCO dataset, whose 91-category label space includes the person class used here for pedestrian detection.
# Run on the GPU, or on the CPU if a GPU is not available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using {device} device!\n")
# Load a model pretrained on the COCO dataset
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn(weights=weights).to(device)
model.eval()
print(model)
Using cuda device!

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(...)
    (fpn): FeaturePyramidNetwork(
      (inner_blocks): ModuleList(...)
      (layer_blocks): ModuleList(...)
      (extra_blocks): LastLevelMaxPool()
    )
  )
  (rpn): RegionProposalNetwork(
    (anchor_generator): AnchorGenerator()
    (head): RPNHead(
      (conv): Sequential(...)
      (cls_logits): Conv2d(256, 3, kernel_size=(1, 1), stride=(1, 1))
      (bbox_pred): Conv2d(256, 12, kernel_size=(1, 1), stride=(1, 1))
    )
  )
  (roi_heads): RoIHeads(
    (box_roi_pool): MultiScaleRoIAlign(featmap_names=['0', '1', '2', '3'], output_size=(7, 7), sampling_ratio=2)
    (box_head): TwoMLPHead(
      (fc6): Linear(in_features=12544, out_features=1024, bias=True)
      (fc7): Linear(in_features=1024, out_features=1024, bias=True)
    )
    (box_predictor): FastRCNNPredictor(
      (cls_score): Linear(in_features=1024, out_features=91, bias=True)
      (bbox_pred): Linear(in_features=1024, out_features=364, bias=True)
    )
  )
)
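The weights object also carries the COCO label metadata, which is worth a quick look before evaluating, since pedestrian detection here relies on COCO's person class. A small sketch:

categories = weights.meta["categories"]
print(len(categories))             # 91 categories, matching cls_score's out_features
print(categories.index("person"))  # 1: "person" is label id 1 in COCO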
Evaluate the model¶
def draw_boxes(image, prediction, score_threshold=0.9, font_size=14, box_width=5, colors=["white", "black"]):
    # Human-readable COCO category names, indexed by label id
    categories = FasterRCNN_ResNet50_FPN_Weights.DEFAULT.meta["categories"]
    # Keep only detections above the confidence threshold
    indexes = prediction["scores"] > score_threshold
    boxes = prediction["boxes"][indexes].to(torch.int64).tolist()
    labels = [categories[i] for i in prediction["labels"][indexes]]
    scores = (prediction["scores"][indexes] * 100).to(torch.int64).tolist()
    labels_scores = [f"{label} {score}%" for label, score in zip(labels, scores)]
    # Draw the boxes and label captions on a PIL copy of the image
    pil_image = functional.to_pil_image(image)
    draw = ImageDraw.Draw(pil_image)
    font = ImageFont.truetype(font="/tmp/ubuntu-font-family-0.83/Ubuntu-B.ttf", size=font_size)
    for box, label in zip(boxes, labels_scores):
        draw.rectangle(box, width=box_width, outline=colors[0])
        text_box = draw.textbbox((box[0], box[1] - font_size), label, font=font)
        draw.rectangle(text_box, fill=colors[0])
        draw.text((box[0], box[1] - font_size), label, fill=colors[1], font=font)
    return pil_image
transforms = weights.transforms()
file = files[0]
image = decode_image(file)
image = resize(image)
# Preprocess with the weights' own transforms and run inference without gradients
with torch.no_grad():
    X = transforms(image).to(device)
    prediction = model([X])[0]
output_image = draw_boxes(image, prediction)
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
axs[0].imshow(image.permute(1, 2, 0))
axs[0].set_title("Input image")
axs[0].axis("off")
axs[1].imshow(output_image)
axs[1].set_title("Output image")
axs[1].axis("off")
plt.show()
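Since torchvision's detection models accept a list of image tensors (of possibly different sizes), several frames can also be scored in a single forward pass. A minimal sketch over the first few files:

# Batched inference: one dict of boxes/labels/scores per input image
batch = [transforms(resize(decode_image(f))).to(device) for f in files[:4]]
with torch.no_grad():
    predictions = model(batch)
print([len(p["boxes"]) for p in predictions])  # raw detection count per frame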
frames = []
fig, ax = plt.subplots()
for file in files:
    image = decode_image(file)
    image = resize(image)
    # Preprocess the frame and run the detector without tracking gradients
    with torch.no_grad():
        X = transforms(image).to(device)
        prediction = model([X])[0]
    # Render the predicted boxes and labels onto the frame
    output_image = draw_boxes(image, prediction)
    plt.tight_layout()
    ax.set_title("Output frames")
    ax.axis("off")
    frame = ax.imshow(output_image, animated=True)
    frames.append([frame])
anim = animation.ArtistAnimation(fig, frames, blit=True, repeat_delay=1000)
plt.close(fig)
HTML(anim.to_html5_video())
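The animation can also be written to disk instead of being embedded in the notebook. A small optional sketch, assuming ffmpeg is available on the PATH (the output path and frame rate are arbitrary choices):

anim.save("/tmp/pedestrian_detection.mp4", writer="ffmpeg", fps=2)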