Source code for segments.huggingface

from __future__ import annotations

import json
import logging
import os
import tempfile
from string import Template
from typing import TYPE_CHECKING, Any, Dict, cast

import requests
from PIL import Image
from segments.utils import load_image_from_url, load_label_bitmap_from_url


# https://adamj.eu/tech/2021/05/13/python-type-hints-how-to-fix-circular-imports/
if TYPE_CHECKING:
    from segments.typing import Release


#############
# Variables #
#############
logger = logging.getLogger(__name__)
try:
    import datasets
    from huggingface_hub import HfApi
except ImportError as e:
    logger.error(
        "Please install HuggingFace datasets first: pip install --upgrade datasets huggingface_hub"
    )
    raise e

# Add some functionality to the push_to_hub function of datasets.Dataset
push_to_hub_original = datasets.Dataset.push_to_hub

hf_api = HfApi()


#############
# Functions #
#############
def push_to_hub(self: datasets.Dataset, repo_id: str, *args: Any, **kwargs: Any) -> None:
    push_to_hub_original(self, repo_id, *args, **kwargs)

    # Upload the label file (https://huggingface.co/datasets/huggingface/label-files)
    if hasattr(self, "id2label"):
        # print("Uploading id2label.json")
        tmpfile = os.path.join(tempfile.gettempdir(), "id2label.json")
        with open(tmpfile, "w") as f:
            json.dump(self.id2label, f)
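        # The resulting id2label.json maps category ids to label names; e.g. a
        # hypothetical two-class taxonomy serializes as {"0": "unlabeled", "1": "car"}.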

        hf_api.upload_file(
            path_or_fileobj=tmpfile,
            path_in_repo="id2label.json",
            repo_id=repo_id,
            repo_type="dataset",
        )

    # Upload README.md
    if hasattr(self, "readme"):
        # print("Uploading README.md")
        tmpfile = os.path.join(tempfile.gettempdir(), "README.md")
        with open(tmpfile, "w") as f:
            f.write(self.readme)

        hf_api.upload_file(
            path_or_fileobj=tmpfile,
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset",
        )


datasets.Dataset.push_to_hub = push_to_hub
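# A minimal usage sketch (hypothetical repo id; assumes a dataset created with
# release2dataset below): thanks to the patch above, push_to_hub also uploads
# the attached id2label.json and README.md.
#
#   dataset = release2dataset(release)
#   dataset.push_to_hub("your-username/your-dataset")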


def get_taxonomy_table(taxonomy: Dict[str, Any]) -> str:
    markdown_table = ""
    for category in taxonomy["categories"]:
        id_ = category["id"]
        name = category["name"]
        description = category.get("description", "-")
        markdown_table += f"| {id_} | {name} | {description} |\n"
    return markdown_table
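# For example, a hypothetical taxonomy {"categories": [{"id": 1, "name": "car"}]}
# yields "| 1 | car | - |\n". Only the rows are generated here; the header row is
# presumably part of dataset_card_template.md, where $taxonomy_table is substituted.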


def release2dataset(release: Release, download_images: bool = True) -> datasets.Dataset:
    """Create a HuggingFace dataset from a release.

    Args:
        release: A Segments release resulting from :meth:`.get_release`.
        download_images: Whether to download the images from their AWS S3 URLs. Defaults to :obj:`True`.

    Returns:
        A HuggingFace dataset.

    Raises:
        :exc:`ValueError`: If the type of dataset is not yet supported.
    """
    content = requests.get(
        cast(str, release.attributes.url)  # TODO Fix in the backend.
    )
    release_dict = json.loads(content.content)

    task_type = release_dict["dataset"]["task_type"]

    if task_type in ["vector", "bboxes", "keypoint"]:
        features = datasets.Features(
            {
                "name": datasets.Value("string"),
                "uuid": datasets.Value("string"),
                "image": {"url": datasets.Value("string")},
                "status": datasets.Value("string"),
                "label": {
                    "annotations": [
                        {
                            "id": datasets.Value("int32"),
                            "category_id": datasets.Value("int32"),
                            "type": datasets.Value("string"),
                            "points": [[datasets.Value("float32")]],
                        }
                    ],
                },
            }
        )
    elif task_type in ["segmentation-bitmap", "segmentation-bitmap-highres"]:
        features = datasets.Features(
            {
                "name": datasets.Value("string"),
                "uuid": datasets.Value("string"),
                "image": {"url": datasets.Value("string")},
                "status": datasets.Value("string"),
                "label": {
                    "annotations": [
                        {
                            "id": datasets.Value("int32"),
                            "category_id": datasets.Value("int32"),
                        }
                    ],
                    "segmentation_bitmap": {"url": datasets.Value("string")},
                },
            }
        )
    elif task_type in ["text-named-entities", "text-span-categorization"]:
        features = datasets.Features(
            {
                "name": datasets.Value("string"),
                "uuid": datasets.Value("string"),
                "text": datasets.Value("string"),
                "status": datasets.Value("string"),
                "label": {
                    "annotations": [
                        {
                            "start": datasets.Value("int32"),
                            "end": datasets.Value("int32"),
                            "category_id": datasets.Value("int32"),
                        }
                    ],
                },
            }
        )
    else:
        raise ValueError("This type of dataset is not yet supported.")

    samples = release_dict["dataset"]["samples"]

    data_rows = []
    for sample in samples:
        try:
            del sample["labels"]["ground-truth"]["attributes"]["format_version"]
        except (KeyError, TypeError):
            pass

        data_row: Dict[str, Any] = {}

        # Name
        data_row["name"] = sample["name"]

        # Uuid
        data_row["uuid"] = sample["uuid"]

        # Status
        try:
            data_row["status"] = sample["labels"]["ground-truth"]["label_status"]
        except (KeyError, TypeError):
            data_row["status"] = "UNLABELED"

        # Image or text
        if task_type in [
            "vector",
            "bboxes",
            "keypoint",
            "segmentation-bitmap",
            "segmentation-bitmap-highres",
        ]:
            try:
                data_row["image"] = sample["attributes"]["image"]
            except (KeyError, TypeError):
                data_row["image"] = {"url": None}
        elif task_type in ["text-named-entities", "text-span-categorization"]:
            try:
                data_row["text"] = sample["attributes"]["text"]
            except (KeyError, TypeError):
                data_row["text"] = None

        # Label
        try:
            label = sample["labels"]["ground-truth"]["attributes"]

            # Remove the image-level attributes
            if "attributes" in label:
                del label["attributes"]

            # Remove the object-level attributes
            for annotation in label["annotations"]:
                if "attributes" in annotation:
                    del annotation["attributes"]

            data_row["label"] = label
        except (KeyError, TypeError):
            error_label: Dict[str, Any] = {"annotations": []}
            if task_type in ["segmentation-bitmap", "segmentation-bitmap-highres"]:
                error_label["segmentation_bitmap"] = {"url": None}
            data_row["label"] = error_label

        data_rows.append(data_row)

    # Now transform to column format
    dataset_dict: Dict[str, Any] = {key: [] for key in features.keys()}
    for data_row in data_rows:
        for key in dataset_dict.keys():
            dataset_dict[key].append(data_row[key])

    # Create the HF Dataset and flatten it
    dataset = datasets.Dataset.from_dict(dataset_dict, features, split="train")
    dataset = dataset.flatten()

    # Optionally download the images
    if (
        task_type
        in [
            "vector",
            "bboxes",
            "keypoint",
            "segmentation-bitmap",
            "segmentation-bitmap-highres",
        ]
        and download_images
    ):

        def download_image(data_row: Dict[str, Any]) -> Dict[str, Any]:
            try:
                data_row["image"] = load_image_from_url(data_row["image.url"])
            except Exception:
                data_row["image"] = None
            return data_row

        def download_segmentation_bitmap(data_row: Dict[str, Any]) -> Dict[str, Any]:
            try:
                segmentation_bitmap = load_label_bitmap_from_url(
                    data_row["label.segmentation_bitmap.url"]
                )
                data_row["label.segmentation_bitmap"] = Image.fromarray(segmentation_bitmap)
            except Exception:
                data_row["label.segmentation_bitmap"] = Image.new("RGB", (1, 1))  # TODO: replace with None
            return data_row

        dataset = dataset.map(download_image, remove_columns=["image.url"])

        if task_type in ["segmentation-bitmap", "segmentation-bitmap-highres"]:
            dataset = dataset.map(
                download_segmentation_bitmap,
                remove_columns=["label.segmentation_bitmap.url"],
            )

            # Reorder the features
            features = datasets.Features(
                {
                    "name": dataset.features["name"],
                    "uuid": dataset.features["uuid"],
                    "status": dataset.features["status"],
                    "image": datasets.Image(),
                    "label.annotations": dataset.features["label.annotations"],
                    "label.segmentation_bitmap": datasets.Image(),
                }
            )
            dataset.info.features = features
        else:
            # Reorder the features
            features = datasets.Features(
                {
                    "name": dataset.features["name"],
                    "uuid": dataset.features["uuid"],
                    "status": dataset.features["status"],
                    "image": datasets.Image(),
                    "label.annotations": dataset.features["label.annotations"],
                }
            )
            dataset.info.features = features

    # Create id2label
    id2label = {}
    for category in release_dict["dataset"]["task_attributes"]["categories"]:
        id2label[category["id"]] = category["name"]
    id2label[0] = "unlabeled"
    dataset.id2label = id2label

    # Create readme.md and update DatasetInfo
    # https://stackoverflow.com/questions/6385686/is-there-a-native-templating-system-for-plain-text-files-in-python
    task_type = release_dict["dataset"]["task_type"]
    if task_type in ["segmentation-bitmap", "segmentation-bitmap-highres"]:
        task_category = "image-segmentation"
    elif task_type in ["vector", "bboxes"]:
        task_category = "object-detection"
    elif task_type in ["text-named-entities", "text-span-categorization"]:
        task_category = "named-entity-recognition"
    else:
        task_category = "other"

    info = {
        "name": release_dict["dataset"]["name"],
        "segments_url": f'https://segments.ai/{release_dict["dataset"]["owner"]}/{release_dict["dataset"]["name"]}',
        "short_description": release_dict["dataset"]["description"],
        "release": release_dict["name"],
        "taxonomy_table": get_taxonomy_table(release_dict["dataset"]["task_attributes"]),
        "task_category": task_category,
    }

    # Create readme.md
    with open(
        os.path.join(os.path.dirname(__file__), "data", "dataset_card_template.md"), "r"
    ) as f:
        template = Template(f.read())
    readme = template.substitute(info)
    dataset.readme = readme

    # Update DatasetInfo
    dataset.info.description = info["short_description"]
    dataset.info.homepage = info["segments_url"]

    return dataset
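

# A minimal end-to-end sketch (hypothetical names; assumes a valid Segments.ai
# API key and an existing release created via the Segments client):
#
#   from segments import SegmentsClient
#   from segments.huggingface import release2dataset
#
#   client = SegmentsClient("YOUR_API_KEY")
#   release = client.get_release("your-username/your-dataset", "v0.1")
#   hf_dataset = release2dataset(release, download_images=True)
#   hf_dataset.push_to_hub("your-username/your-dataset")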