import json import os import re from collections import defaultdict from tqdm import tqdm from .image_utils import load_labelmap IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".avif", ".heic"} ANNOTATION_EXTENSIONS = {".txt", ".json", ".xml", ".csv", ".jsonl"} LABELMAPS_EXTENSIONS = {".labels", ".yaml", ".yml"} def _patch_sep(filename): """ Replace Windows style slashes to keep filenames consistent. Roboflow depend on it server side. """ return filename.replace("\\", "/") def parsefolder(folder, is_classification=False): folder = _patch_sep(folder).strip().rstrip("/") if not os.path.exists(folder): raise Exception(f"folder does not exist. {folder}") files = _list_files(folder) images = [f for f in files if f["extension"] in IMAGE_EXTENSIONS] _add_indices(images) _decide_split(images) annotations = [f for f in files if f["extension"] in ANNOTATION_EXTENSIONS] labelmaps = [f for f in files if f["extension"] in LABELMAPS_EXTENSIONS] labelmaps = _load_labelmaps(folder, labelmaps) _map_labelmaps_to_annotations(annotations, labelmaps) if not _map_annotations_to_images_1to1(images, annotations): annotations = _loadAnnotations(folder, annotations) _map_annotations_to_images_1tomany(images, annotations) if is_classification: _infer_classification_labels_from_folders(images) return { "location": folder, "images": images, } def _alphanumkey(s): s = os.path.splitext(s)[0] # Split the string into two parts: all characters before the last digit sequence, and the last digit sequence match = re.match(r"(.*?)(\d*)$", s) if match: alpha_part = match.group(1) num_part = match.group(2) num_part = int(num_part) if num_part else 0 return (alpha_part, num_part) else: return (s, 0) def _list_files(folder): filedescriptors = [] for root, dirs, files in os.walk(folder): for file in files: file_path = os.path.join(root, file) rel = os.path.relpath(file_path, folder) filedescriptors.append(_describe_file(f"/{rel}")) filedescriptors = sorted(filedescriptors, key=lambda x: _alphanumkey(x["file"])) return filedescriptors def _add_indices(files): for i, f in enumerate(files): f["index"] = i def _describe_file(f): f = _patch_sep(f) name = f.split("/")[-1] dirname = os.path.dirname(f) fullkey, extension = os.path.splitext(f) fullkey2 = fullkey.replace("/labels", "").replace("/images", "") key = os.path.splitext(name)[0] return { "file": f, "dirname": dirname, "name": name, "extension": extension.lower(), "key": key.lower(), "fullkey": fullkey.lower(), "fullkey2": fullkey2.lower(), } def _map_annotations_to_images_1to1(images, annotations): imgmap = {i["fullkey"]: i for i in images} countmapped = 0 for ann in annotations: image = imgmap.get(ann["fullkey"]) if image: image["annotationfile"] = ann countmapped += 1 if countmapped > 0: return True imgmap = {i["fullkey2"]: i for i in images} for ann in annotations: image = imgmap.get(ann["fullkey2"]) if image: image["annotationfile"] = ann countmapped += 1 return countmapped > 0 def _map_annotations_to_images_1tomany(images, annotationFiles): annotationsByDirname = _list_map(annotationFiles, "dirname") imgRefMap, annotationMap = _build_image_and_annotation_maps(annotationFiles) for image in tqdm(images): dirname = image["dirname"] annotationsInSameDir = annotationsByDirname.get(dirname, []) if annotationsInSameDir: for annotationFile in annotationsInSameDir: format = annotationFile["parsedType"] filtered_annotations = _filterIndividualAnnotations( image, annotationFile, format, imgRefMap, annotationMap ) if filtered_annotations: image["annotationfile"] = filtered_annotations break def _build_image_and_annotation_maps(annotationFiles): imgRefMap = {} annotationMap = defaultdict(list) for annFile in annotationFiles: filename, dirname, parsed, parsedType = ( annFile["file"], annFile["dirname"], annFile["parsed"], annFile["parsedType"], ) if parsedType == "coco": for imageRef in parsed["images"]: imgRefMap[f"{filename}/{imageRef['file_name']}"] = imageRef for annotation in parsed["annotations"]: annotationMap[f"{dirname}/{annotation['image_id']}"].append(annotation) return imgRefMap, annotationMap def _filterIndividualAnnotations(image, annotation, format, imgRefMap, annotationMap): parsed = annotation["parsed"] if format == "coco": imgReference = imgRefMap.get(f"{annotation['file']}/{image['name']}") if imgReference: # workaround to make Annotations.js correctly identify this as coco in the backend fake_annotation = { "id": 999999999, "image_id": 999999999, "category_id": 0, "area": 1, "segmentation": [], "iscrowd": 0, } _annotation = {"name": "annotation.coco.json"} annotations_for_image = annotationMap.get(f"{image['dirname']}/{imgReference['id']}", []) _annotation["rawText"] = json.dumps( { "info": parsed["info"], "licenses": parsed["licenses"], "categories": parsed["categories"], "images": [imgReference], "annotations": annotations_for_image or [fake_annotation], } ) return _annotation elif format == "createml": imgReferences = [i for i in parsed if i["image"] == image["name"]] if len(imgReferences) > 1: print(f"warning: found multiple image entries for image {image['file']} in {annotation['file']}") if imgReferences: imgReference = imgReferences[0] _annotation = { "name": "annotation.createml.json", "rawText": json.dumps([imgReference]), } return _annotation elif format == "csv": imgLines = [ld["line"] for ld in parsed["lines"] if ld["file_name"] == image["name"]] if imgLines: headers = parsed["headers"] _annotation = { "name": "annotation.csv", "rawText": "".join([headers] + imgLines), } return _annotation else: return None elif format == "multilabel_csv": rows = [r for r in parsed["rows"] if r["file_name"] == image["name"]] if rows: labels = rows[0]["labels"] return {"type": "classification_multilabel", "labels": labels} else: return None elif format == "jsonl": jsonlLines = [json.dumps(line) for line in parsed if line["image"] == image["name"]] if jsonlLines: _annotation = {"name": "annotation.jsonl", "rawText": "\n".join(jsonlLines)} return _annotation return None def _loadAnnotations(folder, annotations): valid_extensions = {".json", ".csv", ".jsonl"} annotations = [a for a in annotations if a["extension"] in valid_extensions] for ann in annotations: extension = ann["extension"] if extension == ".json": with open(f"{folder}{ann['file']}") as f: parsed = json.load(f) parsedType = _guessAnnotationFileFormat(parsed, extension) if parsedType: ann["parsed"] = parsed ann["parsedType"] = parsedType elif extension == ".jsonl": ann["parsed"] = _read_jsonl(f"{folder}{ann['file']}") ann["parsedType"] = "jsonl" elif extension == ".csv": parsed = _parseAnnotationCSV(f"{folder}{ann['file']}") ann["parsed"] = parsed ann["parsedType"] = parsed.get("type", "csv") return annotations def _read_jsonl(path): data = [] with open(path) as file: for linenum, line in enumerate(file, 1): if not line: continue try: json_object = json.loads(line.strip()) data.append(json_object) except json.JSONDecodeError: print(f"Warning: Skipping invalid JSON line in {path}:{linenum}") return data def _parseAnnotationCSV(filename): # TODO: use a proper CSV library? with open(filename) as f: lines = f.readlines() headers = [h.strip() for h in lines[0].split(",")] # Multi-label classification csv typically named _classes.csv if os.path.basename(filename) == "_classes.csv": parsed_lines = [] for line in lines[1:]: parts = [p.strip() for p in line.split(",")] file_name = parts[0] labels = [headers[i] for i, v in enumerate(parts[1:], start=1) if v == "1"] parsed_lines.append({"file_name": file_name, "labels": labels}) return {"type": "multilabel_csv", "rows": parsed_lines, "headers": headers} header_line = lines[0] lines = [{"file_name": ld.split(",")[0].strip(), "line": ld} for ld in lines[1:]] return { "headers": header_line, "lines": lines, } def _guessAnnotationFileFormat(parsed, extension): if extension == ".json": if isinstance(parsed, dict): if isinstance(parsed.get("annotations"), list) and isinstance(parsed.get("images"), list): return "coco" elif isinstance(parsed, list): return "createml" return None def _map_labelmaps_to_annotations(annotations, labelmaps): if not labelmaps: return labelmapmap = {lm["dirname"]: lm for lm in labelmaps} rootLabelmap = labelmapmap.get("/") if len(labelmapmap) < len(labelmaps): print("warning: unexpectedly found multiple labelmaps per directory") print([lm["file"] for lm in labelmaps]) for ann in annotations: labelmap = labelmapmap.get(ann["dirname"]) or rootLabelmap if labelmap: ann["labelmap"] = labelmap["labelmap"] def _load_labelmaps(folder, labelmaps): for labelmap in labelmaps: try: labelmap["labelmap"] = load_labelmap(f"{folder}{labelmap['file']}") except Exception: # raise Exception(f"failed to load labelmap {labelmap['file']}") pass return [lm for lm in labelmaps if lm.get("labelmap")] def _decide_split(images): for i in images: fullkey = i["fullkey"] if "valid" in fullkey: i["split"] = "valid" elif "train" in fullkey: i["split"] = "train" elif "test" in fullkey: i["split"] = "test" else: i["split"] = "train" def _list_map(my_list, key): d = {} for i in my_list: d.setdefault(i[key], []).append(i) return d def _infer_classification_labels_from_folders(images): for image in images: if image.get("annotationfile"): continue dirname = image.get("dirname", "").strip("/") if not dirname or dirname == ".": # Skip images in root directory or invalid paths continue class_name = os.path.basename(dirname) if class_name and class_name != ".": image["annotationfile"] = {"classification_label": class_name, "type": "classification_folder"}