Incorrect VOC format
There are some issues with the XML files, e.g. the image filename/path being incorrect, and I seem to recall a colleague noticing that the image sizes in the XML didn't always match the actual image sizes (and were sometimes 0 maybe?). I've also noticed some of the bounding boxes are invalid (i.e. have xmin==xmax, and ymin==ymax).
Would it be possible to fix these?
Hi, I have encountered this issue as well and have made a quick fix for these two problems:
- Image size = (0,0) (read actual image size from image) I have not noticed any wrong image sizes.
- BBox size invalid (simply skip invalid BBoxes)
My script is to generate labels from VOC annotations into Yolonet readable format. I have checked the correctness of the boxes by looking through the first 500 images of each set, inclusive of problematic ones.
import xml.etree.ElementTree as ET
import os
from os import listdir, getcwd
from os.path import join
from cv2 import imread
# (dataset directory, image-set split) pairs to convert, part A first.
sets = [
    (part, split)
    for part in ('SCUT_HEAD_Part_A', 'SCUT_HEAD_Part_B')
    for split in ('train', 'val', 'test')
]

# Annotation class names to keep; an entry's position in this list is the
# class index written to the label files.
classes = ["person"]
def convert(size, box):
    """Convert a pixel bounding box into normalised centre/size form.

    Args:
        size: ``(width, height)`` of the image in pixels.
        box: ``(xmin, xmax, ymin, ymax)`` in pixels. Note the order: both x
            values come first, then both y values.

    Returns:
        ``(x, y, w, h)``: the box centre and the box width/height, each
        divided by the corresponding image dimension.

    Raises:
        ZeroDivisionError: if either image dimension is 0.
    """
    # Scale factors that map pixel units to fractions of the image.
    dw = 1. / size[0]
    dh = 1. / size[1]

    x_centre = (box[0] + box[1]) / 2.0
    y_centre = (box[2] + box[3]) / 2.0
    box_w = box[1] - box[0]
    box_h = box[3] - box[2]

    return (x_centre * dw, y_centre * dh, box_w * dw, box_h * dh)
def convert_annotation(year, image_id):
    """Convert one VOC XML annotation into a label file.

    Reads ``<year>/Annotations/<image_id>.xml`` and writes
    ``<year>/labels/<image_id>.txt`` with one line per kept object:
    the class index followed by the four values returned by ``convert``.

    Objects are skipped when their class is not in the module-level
    ``classes`` list, when they are flagged ``difficult``, or when their
    bounding box has non-positive width or height.

    Relies on the module-level names ``classes``, ``wd``, ``convert`` and
    ``imread``.

    Args:
        year: Dataset directory name (used as the path prefix).
        image_id: Image identifier without extension.

    Raises:
        IOError: if the XML records a non-positive image size and the image
            file cannot be read to recover the real size.
    """
    with open('%s/Annotations/%s.xml' % (year, image_id), 'r', encoding='utf-8') as in_file:
        root = ET.parse(in_file).getroot()

    size = root.find('size')
    w = int(size.find('width').text)
    h = int(size.find('height').text)

    # Explicit check instead of ``assert``: asserts are stripped under
    # ``python -O``, which would silently disable this fallback.
    if not (w > 0 and h > 0):
        print("WARN: (0,0) size: %s.jpg" % image_id)
        image_path = '%s/%s/JPEGImages/%s.jpg' % (wd, year, image_id)
        im = imread(image_path)
        if im is None:
            # imread signals failure by returning None rather than raising.
            raise IOError("cannot read image to recover its size: %s" % image_path)
        h, w = im.shape[:2]

    # The label file is always (re)created, even when no object is kept.
    with open('%s/labels/%s.txt' % (year, image_id), 'w') as out_file:
        for obj in root.iter('object'):
            difficult = obj.find('difficult').text
            cls = obj.find('name').text
            if cls not in classes or int(difficult) == 1:
                continue
            cls_id = classes.index(cls)

            xmlbox = obj.find('bndbox')
            # Order expected by convert(): (xmin, xmax, ymin, ymax).
            b = (
                float(xmlbox.find('xmin').text),
                float(xmlbox.find('xmax').text),
                float(xmlbox.find('ymin').text),
                float(xmlbox.find('ymax').text),
            )

            # Skip degenerate boxes (zero or negative width/height).
            if not (b[1] - b[0] > 0 and b[3] - b[2] > 0):
                print("WARN: BBox size cannot be 0, skipping BBox in", image_id)
                continue

            bb = convert((w, h), b)
            out_file.write(str(cls_id) + " " + " ".join([str(a) for a in bb]) + '\n')
# Working directory captured once; used to build absolute image paths here
# and inside convert_annotation().
wd = getcwd()

# For every (dataset directory, split) pair: make sure the labels directory
# exists, write the list of absolute image paths to "<dataset>_<split>.txt",
# and convert each image's annotation.
for year, image_set in sets:
    os.makedirs('%s/labels/' % year, exist_ok=True)

    # Read the image ids up front so the handle is closed before converting.
    with open('%s/ImageSets/Main/%s.txt' % (year, image_set)) as ids_file:
        image_ids = ids_file.read().strip().split()

    # ``with`` guarantees the list file is flushed and closed even if a
    # conversion fails part-way through.
    with open('%s_%s.txt' % (year, image_set), 'w') as list_file:
        for image_id in image_ids:
            print(image_id)
            list_file.write('%s/%s/JPEGImages/%s.jpg\n' % (wd, year, image_id))
            convert_annotation(year, image_id)
If you want to visualize and manually check through the images with BBoxes, run this script.
import os
import glob
from os import listdir, getcwd
from os.path import join
import cv2
# Suffixes of the dataset parts to browse; each one is substituted into
# "SCUT_HEAD_Part_<suffix>".
sets = ["A", "B"]

# Class names used by the label files.
classes = ["person"]
def deconvert(size, box):
    """Turn a normalised centre/size box back into a pixel rectangle.

    Args:
        size: ``(height, width)`` of the image in pixels (note: height
            first, i.e. the order of ``im.shape[:2]``).
        box: ``(x_centre, y_centre, width, height)`` as fractions of the
            image dimensions.

    Returns:
        ``(x, y, w, h)`` as ints: the top-left corner and the box size in
        pixels. Fractions are truncated by ``int()``.
    """
    img_w = size[1]
    img_h = size[0]

    x_cen = box[0] * img_w
    y_cen = box[1] * img_h
    w = int(box[2] * img_w)
    h = int(box[3] * img_h)

    # The corner is derived from the already-truncated width/height.
    x = int(x_cen - w / 2)
    y = int(y_cen - h / 2)
    return x, y, w, h
wd = getcwd()

# Show every labelled image of each dataset part with its boxes drawn on top.
# Press space to advance to the next image; any other key stops the current
# part (the outer loop then continues with the next part).
for image_set in sets:
    # Iterate over the label files that actually exist instead of assuming
    # they are numbered contiguously from 00000; sorting keeps the
    # zero-padded names in numeric order.
    label_paths = sorted(glob.glob("SCUT_HEAD_Part_%s/labels/*.txt" % image_set))
    for label_path in label_paths:
        stem = os.path.splitext(os.path.basename(label_path))[0]
        print(stem)

        with open(label_path, 'r') as label_file:
            label_lines = label_file.read().strip().split('\n')
        # An empty label file yields a single empty list, skipped below.
        image_bboxes = [list(map(float, line.split())) for line in label_lines]

        image_path = 'SCUT_HEAD_Part_%s/JPEGImages/%s.jpg' % (image_set, stem)
        im = cv2.imread(image_path)
        if im is None:
            # cv2.imread returns None when the file is missing or unreadable.
            print("WARN: cannot read image, skipping", image_path)
            continue

        im_size = im.shape[:2]  # (height, width), as deconvert() expects
        for box in image_bboxes:
            if box:
                # box[0] is the class index; the remaining four values are
                # the normalised box.
                x, y, w, h = deconvert(im_size, box[1:])
                cv2.rectangle(im, (x, y), (x + w, y + h), (0, 255, 255), 2)

        cv2.imshow('frame', im)
        # 32 is the key code for the space bar.
        if cv2.waitKey(0) != 32:
            break

cv2.destroyAllWindows()
Could it be that, in the case of a bad bbox (xmin == xmax or ymin == ymax), the head is so small that it is only marked with a single dot? Have any of you tried to visualize what the bad bboxes actually are?