After working with simpler object detection models, we will now implement a Region-based Convolutional Neural Network (RCNN) from scratch. Unlike YOLO, which predicts bounding boxes directly from the full image, RCNN uses a two-step process: it first generates candidate region proposals, then classifies each proposal (and refines its box) with a CNN.
This project will cover loading the Pascal VOC 2007 dataset, generating region proposals with Selective Search, training a CNN classifier on cropped proposals, adding a bounding box regression head, and running inference on new images.
First, install the required dependencies:
pip install torch torchvision opencv-contrib-python numpy matplotlib scikit-learn tqdm
Or, if you prefer conda:
conda install pytorch torchvision torchaudio pytorch-cuda=12.4 -c pytorch -c nvidia

The pytorch-cuda metapackage pulls in the matching CUDA runtime, so a separate cudatoolkit/cudnn install is not needed. Note that we install opencv-contrib-python rather than opencv-python: the Selective Search implementation we use later lives in the contrib ximgproc module.
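Either way, a quick sanity check confirms that PyTorch sees the GPU:

import torch
print(torch.__version__)          # installed PyTorch version
print(torch.cuda.is_available())  # True if the CUDA build is working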
We use Pascal VOC 2007 since it provides images annotated with class labels and bounding boxes. Torchvision can download the dataset directly:
import torchvision
from torchvision import transforms
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
# Define transformations for input images
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# Load the Pascal VOC 2007 training set (downloads on first run)
dataset = torchvision.datasets.VOCDetection(root="./data", year="2007", image_set="train",
                                            download=True, transform=transform)

# View an image and its annotation
image, target = dataset[0]
plt.imshow(image.permute(1, 2, 0))  # convert from (C, H, W) to (H, W, C)
plt.show()
print(target)
Some images from Pascal VOC 2007 dataset
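One caveat: the Resize transform changes the image but not the annotation, so the ground-truth boxes in target still use the original resolution. Below is a minimal sketch (the helper name get_gt_boxes is our own) that parses the VOC annotation dict, where coordinates are stored as strings, and scales the boxes to the 224×224 image:

def get_gt_boxes(target, new_size=224):
    ann = target["annotation"]
    orig_w, orig_h = int(ann["size"]["width"]), int(ann["size"]["height"])
    objects = ann["object"]
    if isinstance(objects, dict):  # some torchvision versions return a dict for a single object
        objects = [objects]
    boxes, names = [], []
    for obj in objects:
        bb = obj["bndbox"]
        # Scale from the original resolution to the resized 224x224 image
        x1 = int(bb["xmin"]) * new_size / orig_w
        y1 = int(bb["ymin"]) * new_size / orig_h
        x2 = int(bb["xmax"]) * new_size / orig_w
        y2 = int(bb["ymax"]) * new_size / orig_h
        boxes.append([x1, y1, x2, y2])
        names.append(obj["name"])
    return boxes, names

gt_boxes, gt_names = get_gt_boxes(target)
print(gt_boxes, gt_names)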
RCNN requires candidate region proposals, which we extract using OpenCV's Selective Search implementation (from the contrib ximgproc module):
import cv2
import numpy as np

def get_region_proposals(image):
    ss = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation()
    ss.setBaseImage(image)
    ss.switchToSelectiveSearchFast()  # faster but less accurate than the quality mode
    rects = ss.process()              # each rect is (x, y, w, h)
    return rects[:2000]               # keep the top 2000 proposals, as in the original RCNN

# Convert the normalized tensor back to a uint8 BGR image for OpenCV
image_cv = cv2.cvtColor((image.permute(1, 2, 0).numpy() * 255).astype(np.uint8), cv2.COLOR_RGB2BGR)
regions = get_region_proposals(image_cv)

# Draw the first 50 proposals on the image
for (x, y, w, h) in regions[:50]:
    cv2.rectangle(image_cv, (x, y), (x + w, y + h), (0, 255, 0), 1)
plt.imshow(cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB))
plt.show()
A visual representation of how selective search works
Each region proposal is cropped from the image and resized to 224×224, then used to train a classification model (based on ResNet18).
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet18
from torch.utils.data import Dataset

class RCNN_Dataset(Dataset):
    def __init__(self, image, regions, labels):
        self.image = image      # a (C, H, W) tensor, already normalized by ToTensor
        self.regions = regions  # (x, y, w, h) proposals from selective search
        self.labels = labels    # one class index per proposal (0 = background)

    def __len__(self):
        return len(self.regions)

    def __getitem__(self, idx):
        x, y, w, h = self.regions[idx]
        cropped = self.image[:, y:y+h, x:x+w]             # crop the region proposal
        cropped = transforms.Resize((224, 224))(cropped)  # resize to the network input size
        return cropped, self.labels[idx]

# Create a dataset from the proposals
labels = [0] * len(regions)  # placeholder: replaced with IoU-based labels below
rcnn_dataset = RCNN_Dataset(image, regions, labels)
rcnn_loader = DataLoader(rcnn_dataset, batch_size=16, shuffle=True)
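The all-zero labels above are only a placeholder. In the original RCNN, each proposal is labeled with the class of the ground-truth box it overlaps most, provided the IoU is at least 0.5; everything else becomes background (class 0). A minimal sketch, assuming gt_boxes and gt_names from the parsing step earlier (the VOC_CLASSES list and both helper names are our own):

VOC_CLASSES = ["aeroplane", "bicycle", "bird", "boat", "bottle",
               "bus", "car", "cat", "chair", "cow",
               "diningtable", "dog", "horse", "motorbike", "person",
               "pottedplant", "sheep", "sofa", "train", "tvmonitor"]

def iou(box_a, box_b):
    # Boxes are (x1, y1, x2, y2); returns intersection-over-union
    x1, y1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    x2, y2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0, x2 - x1) * max(0, y2 - y1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / (area_a + area_b - inter + 1e-6)

def assign_labels(regions, gt_boxes, gt_names, pos_thresh=0.5):
    labels = []
    for (x, y, w, h) in regions:
        proposal = [x, y, x + w, y + h]  # convert (x, y, w, h) to (x1, y1, x2, y2)
        ious = [iou(proposal, gt) for gt in gt_boxes]
        best = int(np.argmax(ious)) if ious else -1
        if best >= 0 and ious[best] >= pos_thresh:
            labels.append(VOC_CLASSES.index(gt_names[best]) + 1)  # 1-20 = object classes
        else:
            labels.append(0)                                      # 0 = background
    return labels

labels = assign_labels(regions, gt_boxes, gt_names)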
# Define the CNN model (ResNet-based)
class RCNN(nn.Module):
    def __init__(self, num_classes=21):  # 20 VOC classes + 1 background class
        super(RCNN, self).__init__()
        self.model = resnet18(weights="IMAGENET1K_V1")  # pretrained backbone
        self.model.fc = nn.Linear(512, num_classes)     # replace the final layer

    def forward(self, x):
        return self.model(x)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RCNN(num_classes=21).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(5):
    for images, labels in rcnn_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/5], Loss: {loss.item():.4f}")
Next, we add a bounding box regression head alongside the classifier to refine the proposal coordinates:
class RCNNWithRegressor(nn.Module):
    def __init__(self, num_classes=21):  # 20 VOC classes + 1 background class
        super(RCNNWithRegressor, self).__init__()
        self.feature_extractor = resnet18(weights="IMAGENET1K_V1")
        self.feature_extractor.fc = nn.Identity()  # expose the 512-d features
        self.classifier = nn.Linear(512, num_classes)
        self.regressor = nn.Linear(512, 4)  # predicts (dx, dy, dw, dh)

    def forward(self, x):
        features = self.feature_extractor(x)
        class_logits = self.classifier(features)
        bbox_deltas = self.regressor(features)
        return class_logits, bbox_deltas

# Train classifier and regressor together
model = RCNNWithRegressor(num_classes=21).to(device)
criterion_cls = nn.CrossEntropyLoss()
criterion_bbox = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

for epoch in range(5):
    for images, labels in rcnn_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        class_logits, bbox_deltas = model(images)
        loss_cls = criterion_cls(class_logits, labels)
        loss_bbox = criterion_bbox(bbox_deltas, torch.zeros_like(bbox_deltas))  # placeholder targets, see below
        loss = loss_cls + loss_bbox
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/5], Classification Loss: {loss_cls.item():.4f}, BBox Loss: {loss_bbox.item():.4f}")
image, _ = dataset[10]
image_cv = cv2.cvtColor((image.permute(1, 2, 0).numpy() * 255).astype(np.uint8), cv2.COLOR_RGB2BGR)
regions = get_region_proposals(image_cv)

model.eval()
for (x, y, w, h) in regions[:50]:
    cropped = image[:, y:y+h, x:x+w]                  # crop the proposal from the tensor image
    cropped = transforms.Resize((224, 224))(cropped)  # resize to the network input size
    cropped = cropped.unsqueeze(0).to(device)         # add a batch dimension
    with torch.no_grad():
        class_logits, bbox_deltas = model(cropped)
    label = torch.argmax(class_logits, dim=1).item()
    if label != 0:  # 0 = background, skip it
        cv2.rectangle(image_cv, (x, y), (x + w, y + h), (255, 0, 0), 2)

plt.imshow(cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB))
plt.show()
Results on Pascal VOC 2007 Dataset
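Because every proposal is scored independently, the raw detections overlap heavily. Standard practice is to apply non-maximum suppression (NMS), which torchvision provides as torchvision.ops.nms. A toy sketch below; in practice you would feed it the boxes and softmax confidences collected during the inference loop above:

from torchvision.ops import nms

# Three candidate boxes in (x1, y1, x2, y2) format; the first two overlap heavily
boxes = torch.tensor([[10., 10., 100., 100.],
                      [12., 12., 102., 102.],
                      [200., 200., 250., 250.]])
scores = torch.tensor([0.9, 0.8, 0.7])
keep = nms(boxes, scores, iou_threshold=0.3)
print(keep)  # tensor([0, 2]) -- the lower-scoring overlapping box is suppressed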
✔ RCNN extracts region proposals and classifies them with CNNs
✔ Bounding box regression improves localization
✔ Compared with YOLO, RCNN is slower but can be more accurate on small objects
Next Steps: Try implementing Faster RCNN, which replaces Selective Search with a Region Proposal Network (RPN) for dramatically faster inference!