获取 Croissant 元数据

数据集查看器会自动为 Hugging Face Hub 上的每个数据集生成 Croissant 格式（JSON-LD）的元数据。其中列出了数据集的名称、描述、URL，以及以 Parquet 文件形式提供的数据分发（distribution）信息，包括各列的元数据。凡是能够转换为 Parquet 格式的数据集，都可以获取 Croissant 元数据。

什么是 Croissant？

Croissant 是一种基于 schema.org 构建的元数据格式，用于描述机器学习数据集，以便对这些数据集进行索引、搜索和以编程方式加载。

获取元数据

本指南将向您展示如何使用 Hugging Face /croissant 端点来检索与数据集关联的 Croissant 元数据。

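/croissant 端点通过 URL 路径接收数据集名称。例如，对于 ibm/duorc 数据集，向 https://huggingface.co/api/datasets/ibm/duorc/croissant 发送 GET 请求即可，可使用 Python、JavaScript 或 cURL 发送该请求：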

Python

JavaScript

cURL

在底层，该端点会调用 https://datasets-server.huggingface.co/croissant-crumbs 端点，并用 Hub 上的元数据对返回结果加以补充。

端点的响应是一个 JSON-LD 文档，其中包含 Croissant 格式的元数据。例如，ibm/duorc 数据集有两个子集：ParaphraseRC 和 SelfRC（有关拆分和子集的更多详细信息，请参阅《列出拆分和子集》指南）。元数据链接到它们的 Parquet 文件，并描述了六列中每一列的类型：plot_id、plot、title、question_id、question 和 no_answer（answers 列被跳过，未包含在元数据中）：

{
  "@context": {
    "@language": "en",
    "@vocab": "https://schema.org/",
    "citeAs": "cr:citeAs",
    "column": "cr:column",
    "conformsTo": "dct:conformsTo",
    "cr": "http://mlcommons.org/croissant/",
    "data": {
      "@id": "cr:data",
      "@type": "@json"
    },
    "dataBiases": "cr:dataBiases",
    "dataCollection": "cr:dataCollection",
    "dataType": {
      "@id": "cr:dataType",
      "@type": "@vocab"
    },
    "dct": "http://purl.org/dc/terms/",
    "extract": "cr:extract",
    "field": "cr:field",
    "fileProperty": "cr:fileProperty",
    "fileObject": "cr:fileObject",
    "fileSet": "cr:fileSet",
    "format": "cr:format",
    "includes": "cr:includes",
    "isLiveDataset": "cr:isLiveDataset",
    "jsonPath": "cr:jsonPath",
    "key": "cr:key",
    "md5": "cr:md5",
    "parentField": "cr:parentField",
    "path": "cr:path",
    "personalSensitiveInformation": "cr:personalSensitiveInformation",
    "recordSet": "cr:recordSet",
    "references": "cr:references",
    "regex": "cr:regex",
    "repeated": "cr:repeated",
    "replace": "cr:replace",
    "sc": "https://schema.org/",
    "separator": "cr:separator",
    "source": "cr:source",
    "subField": "cr:subField",
    "transform": "cr:transform"
  },
  "@type": "sc:Dataset",
  "distribution": [
    {
      "@type": "cr:FileObject",
      "@id": "repo",
      "name": "repo",
      "description": "The Hugging Face git repository.",
      "contentUrl": "https://huggingface.co/datasets/ibm/duorc/tree/refs%2Fconvert%2Fparquet",
      "encodingFormat": "git+https",
      "sha256": "https://github.com/mlcommons/croissant/issues/80"
    },
    {
      "@type": "cr:FileSet",
      "@id": "parquet-files-for-config-ParaphraseRC",
      "name": "parquet-files-for-config-ParaphraseRC",
      "description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).",
      "containedIn": {
        "@id": "repo"
      },
      "encodingFormat": "application/x-parquet",
      "includes": "ParaphraseRC/*/*.parquet"
    },
    {
      "@type": "cr:FileSet",
      "@id": "parquet-files-for-config-SelfRC",
      "name": "parquet-files-for-config-SelfRC",
      "description": "The underlying Parquet files as converted by Hugging Face (see: https://huggingface.co/docs/dataset-viewer/parquet).",
      "containedIn": {
        "@id": "repo"
      },
      "encodingFormat": "application/x-parquet",
      "includes": "SelfRC/*/*.parquet"
    }
  ],
  "recordSet": [
    {
      "@type": "cr:RecordSet",
      "@id": "ParaphraseRC",
      "name": "ParaphraseRC",
      "description": "ibm/duorc - 'ParaphraseRC' subset\n\nAdditional information:\n- 3 splits: train, validation, test\n- 1 skipped column: answers",
      "field": [
        {
          "@type": "cr:Field",
          "@id": "ParaphraseRC/plot_id",
          "name": "ParaphraseRC/plot_id",
          "description": "Column 'plot_id' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-ParaphraseRC"
            },
            "extract": {
              "column": "plot_id"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "ParaphraseRC/plot",
          "name": "ParaphraseRC/plot",
          "description": "Column 'plot' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-ParaphraseRC"
            },
            "extract": {
              "column": "plot"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "ParaphraseRC/title",
          "name": "ParaphraseRC/title",
          "description": "Column 'title' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-ParaphraseRC"
            },
            "extract": {
              "column": "title"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "ParaphraseRC/question_id",
          "name": "ParaphraseRC/question_id",
          "description": "Column 'question_id' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-ParaphraseRC"
            },
            "extract": {
              "column": "question_id"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "ParaphraseRC/question",
          "name": "ParaphraseRC/question",
          "description": "Column 'question' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-ParaphraseRC"
            },
            "extract": {
              "column": "question"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "ParaphraseRC/no_answer",
          "name": "ParaphraseRC/no_answer",
          "description": "Column 'no_answer' from the Hugging Face parquet file.",
          "dataType": "sc:Boolean",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-ParaphraseRC"
            },
            "extract": {
              "column": "no_answer"
            }
          }
        }
      ]
    },
    {
      "@type": "cr:RecordSet",
      "@id": "SelfRC",
      "name": "SelfRC",
      "description": "ibm/duorc - 'SelfRC' subset\n\nAdditional information:\n- 3 splits: train, validation, test\n- 1 skipped column: answers",
      "field": [
        {
          "@type": "cr:Field",
          "@id": "SelfRC/plot_id",
          "name": "SelfRC/plot_id",
          "description": "Column 'plot_id' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-SelfRC"
            },
            "extract": {
              "column": "plot_id"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "SelfRC/plot",
          "name": "SelfRC/plot",
          "description": "Column 'plot' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-SelfRC"
            },
            "extract": {
              "column": "plot"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "SelfRC/title",
          "name": "SelfRC/title",
          "description": "Column 'title' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-SelfRC"
            },
            "extract": {
              "column": "title"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "SelfRC/question_id",
          "name": "SelfRC/question_id",
          "description": "Column 'question_id' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-SelfRC"
            },
            "extract": {
              "column": "question_id"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "SelfRC/question",
          "name": "SelfRC/question",
          "description": "Column 'question' from the Hugging Face parquet file.",
          "dataType": "sc:Text",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-SelfRC"
            },
            "extract": {
              "column": "question"
            }
          }
        },
        {
          "@type": "cr:Field",
          "@id": "SelfRC/no_answer",
          "name": "SelfRC/no_answer",
          "description": "Column 'no_answer' from the Hugging Face parquet file.",
          "dataType": "sc:Boolean",
          "source": {
            "fileSet": {
              "@id": "parquet-files-for-config-SelfRC"
            },
            "extract": {
              "column": "no_answer"
            }
          }
        }
      ]
    }
  ],
  "name": "duorc",
  "description": "\n\t\n\t\t\n\t\n\t\n\t\tDataset Card for duorc\n\t\n\n\n\t\n\t\t\n\t\n\t\n\t\tDataset Summary\n\t\n\nThe DuoRC dataset is an English language dataset of questions and answers gathered from crowdsourced AMT workers on Wikipedia and IMDb movie plots. The workers were given freedom to pick answer from the plots or synthesize their own answers. It contains two sub-datasets - SelfRC and ParaphraseRC. SelfRC dataset is built on Wikipedia movie plots solely. ParaphraseRC has questions written from Wikipedia movie plots and the… See the full description on the dataset page: https://huggingface.co/datasets/ibm/duorc.",
  "alternateName": [
    "ibm/duorc",
    "DuoRC"
  ],
  "creator": {
    "@type": "Organization",
    "name": "IBM",
    "url": "https://huggingface.co/ibm"
  },
  "keywords": [
    "question-answering",
    "text2text-generation",
    "abstractive-qa",
    "extractive-qa",
    "crowdsourced",
    "crowdsourced",
    "monolingual",
    "100K<n<1M",
    "10K<n<100K",
    "original",
    "English",
    "mit",
    "Croissant",
    "arxiv:1804.07927",
    "🇺🇸 Region: US"
  ],
  "license": "https://choosealicense.com/licenses/mit/",
  "sameAs": "https://duorc.github.io/",
  "url": "https://huggingface.co/datasets/ibm/duorc"
}

加载数据集

要加载数据集，您可以使用 mlcroissant 库。它提供了一种从 Croissant 元数据加载数据集的简单方法。

< > 在 GitHub 上更新