{ "openapi": "3.1.0", "info": { "title": "Dataset viewer server API", "description": "The 🤗 dataset viewer API gives access to the contents, metadata and basic statistics of the Hugging Face Hub datasets.", "contact": { "name": "API Support", "email": "website@huggingface.co", "url": "https://github.com/huggingface/dataset-viewer/" }, "version": "1.0", "license": { "name": "Apache 2.0", "url": "https://www.apache.org/licenses/LICENSE-2.0" } }, "tags": [ { "name": "datasets", "description": "API to access datasets" } ], "servers": [ { "url": "https://datasets-server.huggingface.co", "description": "Production server" } ], "components": { "headers": { "Access-Control-Allow-Origin": { "description": "Indicates whether the response can be shared with requesting code from the given origin.", "schema": { "type": "string" }, "example": "*", "required": true }, "Cache-Control": { "description": "Directives that control caching in browsers and shared caches. This directive is used internally for caching the responses: the API will serve the same response until the cache has expired.", "schema": { "type": "string" }, "examples": { "no-cache": { "summary": "No cache.", "value": "no-cache" }, "max-age": { "summary": "Cache TTL.", "value": "max-age=120" } }, "required": true }, "X-Error-Code-401": { "description": "A string that identifies the underlying error for 401.", "schema": { "oneOf": [ { "$ref": "#/components/schemas/X-Error-Code-ExternalUnauthenticatedError" } ] }, "required": true }, "X-Error-Code-404": { "description": "A string that identifies the underlying error for 404.", "schema": { "oneOf": [ { "$ref": "#/components/schemas/X-Error-Code-ExternalAuthenticatedError" }, { "$ref": "#/components/schemas/X-Error-Code-ResponseNotFound" } ] }, "required": true }, "X-Error-Code-422": { "description": "A string that identifies the underlying error for 422.", "schema": { "oneOf": [ { "$ref": "#/components/schemas/X-Error-Code-MissingRequiredParameter" } ] }, "required": 
true }, "X-Error-Code-500": { "description": "A string that identifies the underlying error for 500. It's marked as required: false because the header can be missing on text-plain response.", "schema": { "oneOf": [ { "$ref": "#/components/schemas/X-Error-Code-ResponseNotReadyError" }, { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" } ] }, "required": false }, "X-Error-Code-500-first-rows": { "description": "A string that identifies the underlying error for 500 on /first-rows. It's marked as required: false because the header can be missing on text-plain response.", "schema": { "oneOf": [ { "$ref": "#/components/schemas/X-Error-Code-ResponseNotReadyError" }, { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" }, { "$ref": "#/components/schemas/X-Error-Code-StreamingRowsError" } ] }, "required": false }, "X-Error-Code-500-is-valid": { "description": "A string that identifies the underlying error for 500 on /is-valid. It's marked as required: false because the header can be missing on text-plain response.", "schema": { "oneOf": [ { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" } ] }, "required": false }, "X-Error-Code-500-common": { "description": "A string that identifies the underlying error for 500 on /parquet. It's marked as required: false because the header can be missing on text-plain response.", "schema": { "oneOf": [ { "$ref": "#/components/schemas/X-Error-Code-ResponseNotReadyError" }, { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" } ] }, "required": false }, "X-Error-Code-500-rows": { "description": "A string that identifies the underlying error for 500 on /rows. 
It's marked as required: false because the header can be missing on text-plain response.", "schema": { "oneOf": [ { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" }, { "$ref": "#/components/schemas/X-Error-Code-RowsPostProcessingError" } ] }, "required": false }, "X-Error-Code-500-search": { "description": "A string that identifies the underlying error for 500 on /search. It's marked as required: false because the header can be missing on text-plain response.", "schema": { "oneOf": [ { "$ref": "#/components/schemas/X-Error-Code-UnexpectedError" }, { "$ref": "#/components/schemas/X-Error-Code-RowsPostProcessingError" } ] }, "required": false }, "X-Error-Code-501": { "description": "A string that identifies the underlying error for 501.", "schema": { "oneOf": [ { "$ref": "#/components/schemas/X-Error-Code-DatasetInBlockListError" }, { "$ref": "#/components/schemas/X-Error-Code-DatasetWithTooManyConfigsError" } ] }, "required": true } }, "schemas": { "ServerErrorResponse": { "type": "string", "example": "Internal Server Error" }, "ConfigItem": { "type": "object", "required": ["dataset", "config"], "properties": { "dataset": { "type": "string" }, "config": { "type": "string" } } }, "ConfigItems": { "type": "array", "items": { "$ref": "#/components/schemas/ConfigItem" } }, "FailedConfigItem": { "type": "object", "required": ["dataset", "config", "error"], "properties": { "dataset": { "type": "string" }, "config": { "type": "string" }, "error": { "$ref": "#/components/schemas/CustomError" } } }, "FailedConfigItems": { "type": "array", "items": { "$ref": "#/components/schemas/FailedConfigItem" } }, "SplitsResponse": { "type": "object", "required": ["splits"], "properties": { "splits": { "type": "array", "items": { "$ref": "#/components/schemas/SplitItem" } }, "pending": { "$ref": "#/components/schemas/ConfigItems" }, "failed": { "$ref": "#/components/schemas/FailedConfigItems" } } }, "SplitItem": { "type": "object", "required": ["dataset", "config", "split"], 
"properties": { "dataset": { "type": "string" }, "config": { "type": "string" }, "split": { "type": "string" } } }, "CustomError": { "type": "object", "required": ["error"], "properties": { "error": { "type": "string" }, "cause_exception": { "type": "string" }, "cause_message": { "type": "string" }, "cause_traceback": { "type": "array", "items": { "type": "string" } } } }, "FirstRowsResponse": { "type": "object", "required": [ "dataset", "config", "split", "features", "rows", "truncated" ], "properties": { "dataset": { "type": "string" }, "config": { "type": "string" }, "split": { "type": "string" }, "features": { "type": "array", "items": { "$ref": "#/components/schemas/FeatureItem" } }, "rows": { "type": "array", "items": { "$ref": "#/components/schemas/RowItem" } }, "truncated": { "type": "boolean" } } }, "PaginatedResponse": { "type": "object", "required": [ "features", "rows", "num_rows_total", "num_rows_per_page", "partial" ], "properties": { "features": { "type": "array", "items": { "$ref": "#/components/schemas/FeatureItem" } }, "rows": { "type": "array", "items": { "$ref": "#/components/schemas/RowItem" } }, "num_rows_total": { "type": "integer" }, "num_rows_per_page": { "type": "integer" }, "partial": { "type": "boolean" } } }, "FeatureItem": { "type": "object", "required": ["feature_idx", "name", "type"], "properties": { "feature_idx": { "type": "integer" }, "name": { "type": "string" }, "type": { "$ref": "#/components/schemas/Feature" } } }, "Feature": { "oneOf": [ { "$ref": "#/components/schemas/ValueFeature" }, { "$ref": "#/components/schemas/ClassLabelFeature" }, { "$ref": "#/components/schemas/ArrayXDFeature" }, { "$ref": "#/components/schemas/TranslationFeature" }, { "$ref": "#/components/schemas/TranslationVariableLanguagesFeature" }, { "$ref": "#/components/schemas/JsonFeature" }, { "$ref": "#/components/schemas/SequenceFeature" }, { "$ref": "#/components/schemas/LargeListFeature" }, { "$ref": "#/components/schemas/DictFeature" }, { "$ref": 
"#/components/schemas/ListFeature" }, { "$ref": "#/components/schemas/LegacyListFeature" }, { "$ref": "#/components/schemas/AudioFeature" }, { "$ref": "#/components/schemas/ImageFeature" }, { "$ref": "#/components/schemas/VideoFeature" }, { "$ref": "#/components/schemas/PdfFeature" } ] }, "ValueFeature": { "type": "object", "required": ["_type", "dtype"], "properties": { "_type": { "type": "string", "enum": ["Value"] }, "dtype": { "type": "string", "enum": [ "null", "bool", "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64", "float16", "float32 (alias float)", "float64 (alias double)", "time32[(s|ms)]", "time64[(us|ns)]", "timestamp[(s|ms|us|ns)]", "timestamp[(s|ms|us|ns), tz=(tzstring)]", "date32", "date64", "duration[(s|ms|us|ns)]", "decimal128(precision, scale)", "decimal256(precision, scale)", "binary", "large_binary", "string", "large_string" ] } } }, "ClassLabelFeature": { "type": "object", "required": ["_type", "names"], "properties": { "_type": { "type": "string", "enum": ["ClassLabel"] }, "names": { "type": "array", "items": { "type": "string" } } } }, "ArrayXDFeature": { "type": "object", "required": ["_type", "shape"], "properties": { "_type": { "type": "string", "enum": ["Array2D", "Array3D", "Array4D", "Array5D"] }, "shape": { "type": "array", "items": { "type": "integer", "nullable": true } } } }, "TranslationFeature": { "type": "object", "required": ["_type", "languages"], "properties": { "_type": { "type": "string", "enum": ["Translation"] }, "languages": { "type": "array", "items": { "type": "string" } } } }, "TranslationVariableLanguagesFeature": { "type": "object", "required": ["_type", "languages"], "properties": { "_type": { "type": "string", "enum": ["TranslationVariableLanguages"] }, "num_languages": { "type": "integer" }, "languages": { "type": "array", "items": { "type": "string" } } } }, "JsonFeature": { "type": "object", "required": ["_type"], "properties": { "_type": { "type": "string", "enum": ["Json"] }, "decode": 
{ "type": "boolean" } } }, "SequenceFeature": { "type": "object", "required": ["_type", "feature"], "properties": { "_type": { "type": "string", "enum": ["Sequence"] }, "length": { "type": "integer" }, "feature": { "$ref": "#/components/schemas/Feature" } } }, "ListFeature": { "type": "object", "required": ["_type", "feature"], "properties": { "_type": { "type": "string", "enum": ["List"] }, "feature": { "$ref": "#/components/schemas/Feature" } } }, "LargeListFeature": { "type": "object", "required": ["_type", "feature"], "properties": { "_type": { "type": "string", "enum": ["LargeList"] }, "feature": { "$ref": "#/components/schemas/Feature" } } }, "DictFeature": { "type": "object", "additionalProperties": { "$ref": "#/components/schemas/Feature" } }, "LegacyListFeature": { "type": "array", "items": { "$ref": "#/components/schemas/Feature" } }, "AudioFeature": { "type": "object", "required": ["_type", "sampling_rate"], "properties": { "_type": { "type": "string", "enum": ["Audio"] }, "sampling_rate": { "type": "number" }, "mono": { "type": "boolean" }, "decode": { "type": "boolean" } } }, "ImageFeature": { "type": "object", "required": ["_type"], "properties": { "_type": { "type": "string", "enum": ["Image"] }, "decode": { "type": "boolean" } } }, "PdfFeature": { "type": "object", "required": ["_type"], "properties": { "_type": { "type": "string", "enum": ["Pdf"] }, "decode": { "type": "boolean" } } }, "VideoFeature": { "type": "object", "required": ["_type"], "properties": { "_type": { "type": "string", "enum": ["Video"] }, "decode": { "type": "boolean" } } }, "RowItem": { "type": "object", "required": ["row_idx", "row", "truncated_cells"], "properties": { "row_idx": { "type": "integer" }, "row": { "type": "object", "additionalProperties": { "$ref": "#/components/schemas/Cell" } }, "truncated_cells": { "type": "array", "items": { "type": "string" } } } }, "Cell": { "oneOf": [ { "$ref": "#/components/schemas/ValueCell" }, { "$ref": 
"#/components/schemas/ClassLabelCell" }, { "$ref": "#/components/schemas/Array2DCell" }, { "$ref": "#/components/schemas/Array3DCell" }, { "$ref": "#/components/schemas/Array4DCell" }, { "$ref": "#/components/schemas/Array5DCell" }, { "$ref": "#/components/schemas/TranslationCell" }, { "$ref": "#/components/schemas/TranslationVariableLanguagesCell" }, { "$ref": "#/components/schemas/JsonCell" }, { "$ref": "#/components/schemas/SequenceCell" }, { "$ref": "#/components/schemas/DictCell" }, { "$ref": "#/components/schemas/ListCell" }, { "$ref": "#/components/schemas/AudioCell" }, { "$ref": "#/components/schemas/ImageCell" }, { "$ref": "#/components/schemas/NullableImagesListCell" }, { "$ref": "#/components/schemas/VideoCell" }, { "$ref": "#/components/schemas/PdfCell" } ] }, "ValueCell": { "oneOf": [ { "type": "boolean" }, { "type": "integer" }, { "type": "number" }, { "type": "string" } ], "nullable": true }, "ClassLabelCell": { "type": "integer" }, "Array2DCell": { "type": "array", "items": { "type": "array", "items": { "type": "number" } } }, "Array3DCell": { "type": "array", "items": { "$ref": "#/components/schemas/Array2DCell" } }, "Array4DCell": { "type": "array", "items": { "$ref": "#/components/schemas/Array3DCell" } }, "Array5DCell": { "type": "array", "items": { "$ref": "#/components/schemas/Array4DCell" } }, "TranslationCell": { "type": "object", "additionalProperties": { "type": "string" } }, "TranslationVariableLanguagesCell": { "type": "object", "required": ["language", "translation"], "properties": { "language": { "type": "array", "items": { "type": "string" } }, "translation": { "type": "array", "items": { "type": "string" } } } }, "JsonCell": { "oneOf": [ { "$ref": "#/components/schemas/ValueCell" }, { "$ref": "#/components/schemas/ListCell" }, { "$ref": "#/components/schemas/DictCell" } ] }, "SequenceCell": { "oneOf": [ { "$ref": "#/components/schemas/ListCell" }, { "$ref": "#/components/schemas/DictionaryOfListsCell" } ] }, "ListCell": { "type": 
"array", "items": { "$ref": "#/components/schemas/Cell" } }, "DictionaryOfListsCell": { "type": "object", "additionalProperties": { "$ref": "#/components/schemas/ListCell" } }, "DictCell": { "type": "object", "additionalProperties": { "$ref": "#/components/schemas/Cell" } }, "AudioCell": { "type": "array", "items": { "type": "object", "required": ["src", "type"], "properties": { "src": { "type": "string", "format": "uri" }, "type": { "type": "string" } } } }, "ImageCell": { "type": "object", "properties": { "src": { "type": "string", "format": "uri" }, "height": { "type": "integer" }, "width": { "type": "integer" } }, "required": ["src"] }, "NullableImagesListCell": { "type": "array", "items": { "oneOf": [ { "$ref": "#/components/schemas/ImageCell" }, { "type": "null" } ] } }, "PdfCell": { "type": "object", "properties": { "src": { "type": "string", "format": "uri" }, "thumbnail":{ "$ref": "#/components/schemas/ImageCell" }, "size_bytes": { "type": "integer" } }, "required": ["src", "thumbnail", "size_bytes"] }, "VideoCell": { "type": "object", "properties": { "src": { "type": "string", "format": "uri" } }, "required": ["src"] }, "IsValidResponse": { "type": "object", "required": ["preview", "viewer", "search", "filter", "statistics"], "properties": { "viewer": { "type": "boolean" }, "preview": { "type": "boolean" }, "search": { "type": "boolean" }, "filter": { "type": "boolean" }, "statistics": { "type": "boolean" } } }, "Job": { "type": "object", "required": ["dataset", "config", "split", "kind"], "properties": { "dataset": { "type": "string" }, "kind": { "type": "string" }, "config": { "type": "string" }, "split": { "anyOf": [{ "type": "string" }, { "type": "null" }] } } }, "Jobs": { "type": "array", "items": { "$ref": "#/components/schemas/Job" } }, "ParquetResponse": { "type": "object", "required": ["parquet_files", "partial"], "properties": { "parquet_files": { "type": "array", "items": { "$ref": "#/components/schemas/SplitHubFile" } }, "features": { "type": 
"object" }, "pending": { "$ref": "#/components/schemas/Jobs" }, "failed": { "$ref": "#/components/schemas/Jobs" }, "partial": { "$ref": "#/components/schemas/Partial" } } }, "SplitHubFile": { "type": "object", "required": ["dataset", "config", "split", "url", "filename", "size"], "properties": { "dataset": { "type": "string" }, "config": { "type": "string" }, "split": { "type": "string" }, "url": { "type": "string", "format": "uri" }, "filename": { "type": "string" }, "size": { "type": "integer" } } }, "InfoResponse": { "type": "object", "required": ["dataset_info", "partial"], "properties": { "dataset_info": { "type": "object", "description": "A dump of the DatasetInfo object from the datasets library. See https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetInfo. We don't describe the contents of these metadata for now." }, "pending": { "$ref": "#/components/schemas/Jobs" }, "failed": { "$ref": "#/components/schemas/Jobs" }, "partial": { "$ref": "#/components/schemas/Partial" } } }, "Partial": { "type": "boolean", "description": "True means that the response has been computed on part of the dataset (typically the first 5GB). False means that the complete dataset was used." 
}, "DatasetSize": { "type": "object", "required": [ "dataset", "num_bytes_parquet_files", "num_bytes_memory", "num_rows", "estimated_num_rows" ], "properties": { "dataset": { "type": "string" }, "num_bytes_original_files": { "type": "integer" }, "num_bytes_parquet_files": { "type": "integer" }, "num_bytes_memory": { "type": "integer" }, "num_rows": { "type": "integer" }, "estimated_num_rows": { "anyOf": [{ "type": "integer" }, { "type": "null" }] } } }, "ConfigSize": { "type": "object", "required": [ "dataset", "config", "num_bytes_parquet_files", "num_bytes_memory", "num_rows", "num_columns", "estimated_num_rows" ], "properties": { "dataset": { "type": "string" }, "config": { "type": "string" }, "num_bytes_original_files": { "type": "integer" }, "num_bytes_parquet_files": { "type": "integer" }, "num_bytes_memory": { "type": "integer" }, "num_rows": { "type": "integer" }, "num_columns": { "type": "integer" }, "estimated_num_rows": { "anyOf": [{ "type": "integer" }, { "type": "null" }] } } }, "SplitSize": { "type": "object", "required": [ "dataset", "config", "split", "num_bytes_parquet_files", "num_bytes_memory", "num_rows", "num_columns", "estimated_num_rows" ], "properties": { "dataset": { "type": "string" }, "config": { "type": "string" }, "split": { "type": "string" }, "num_bytes_parquet_files": { "type": "integer" }, "num_bytes_memory": { "type": "integer" }, "num_rows": { "type": "integer" }, "num_columns": { "type": "integer" }, "estimated_num_rows": { "anyOf": [{ "type": "integer" }, { "type": "null" }] } } }, "DatasetSizeResponse": { "type": "object", "required": ["size", "pending", "failed", "partial"], "properties": { "size": { "type": "object", "required": ["dataset", "configs", "splits"], "properties": { "dataset": { "$ref": "#/components/schemas/DatasetSize" }, "configs": { "type": "array", "items": { "$ref": "#/components/schemas/ConfigSize" } }, "splits": { "type": "array", "items": { "$ref": "#/components/schemas/SplitSize" } } } }, "pending": { 
"$ref": "#/components/schemas/Jobs" }, "failed": { "$ref": "#/components/schemas/Jobs" }, "partial": { "$ref": "#/components/schemas/Partial" } } }, "ConfigSizeResponse": { "type": "object", "required": ["size", "partial"], "properties": { "size": { "type": "object", "required": ["config", "splits"], "properties": { "config": { "$ref": "#/components/schemas/ConfigSize" }, "splits": { "type": "array", "items": { "$ref": "#/components/schemas/SplitSize" } } } }, "partial": { "$ref": "#/components/schemas/Partial" } } }, "SizeResponse": { "oneOf": [ { "$ref": "#/components/schemas/DatasetSizeResponse" }, { "$ref": "#/components/schemas/ConfigSizeResponse" } ] }, "OptInOutUrlsCountResponse": { "type": "object", "required": [ "urls_columns", "num_opt_in_urls", "num_opt_out_urls", "num_urls", "num_scanned_rows", "has_urls_columns", "full_scan" ], "properties": { "urls_columns": { "type": "array", "items": { "type": "string" } }, "num_opt_in_urls": { "type": "integer" }, "num_opt_out_urls": { "type": "integer" }, "num_urls": { "type": "integer" }, "num_scanned_rows": { "type": "integer" }, "has_urls_columns": { "type": "boolean" }, "full_scan": { "anyOf": [{ "type": "boolean" }, { "type": "null" }] } } }, "PresidioEntitiesCountResponse": { "type": "object", "required": [ "scanned_columns", "num_rows_with_person_entities", "num_rows_with_phone_number_entities", "num_rows_with_email_address_entities", "num_rows_with_sensitive_pii", "num_scanned_rows", "has_scanned_columns" ], "properties": { "scanned_columns": { "type": "array", "items": { "type": "string" } }, "num_rows_with_person_entities": { "type": "integer" }, "num_rows_with_phone_number_entities": { "type": "integer" }, "num_rows_with_email_address_entities": { "type": "integer" }, "num_rows_with_sensitive_pii": { "type": "integer" }, "num_scanned_rows": { "type": "integer" }, "has_scanned_columns": { "type": "boolean" }, "full_scan": { "anyOf": [{ "type": "boolean" }, { "type": "null" }] } } }, "ColumnType": { 
"type": "string", "enum": [ "float", "int", "class_label", "string_label", "string_text", "bool", "list", "audio", "image", "datetime" ] }, "Histogram": { "type": "object", "required": ["hist", "bin_edges"], "properties": { "hist": { "type": "array", "items": { "type": "integer" } }, "bin_edges": { "type": "array", "items": { "type": "number" } } } }, "DatetimeHistogram": { "type": "object", "required": ["hist", "bin_edges"], "properties": { "hist": { "type": "array", "items": { "type": "integer" } }, "bin_edges": { "type": "array", "items": { "type": "string" } } } }, "NumericalStatisticsItem": { "type": "object", "required": [ "nan_count", "nan_proportion", "min", "max", "mean", "median", "std", "histogram" ], "properties": { "nan_count": { "type": "integer" }, "nan_proportion": { "type": "number" }, "min": { "oneOf": [{ "type": "number" }, { "type": "null" }] }, "max": { "oneOf": [{ "type": "number" }, { "type": "null" }] }, "mean": { "oneOf": [{ "type": "number" }, { "type": "null" }] }, "median": { "oneOf": [{ "type": "number" }, { "type": "null" }] }, "std": { "oneOf": [{ "type": "number" }, { "type": "null" }] }, "histogram": { "oneOf": [ { "$ref": "#/components/schemas/Histogram" }, { "type": "null" } ] } } }, "DatetimeStatisticsItem": { "type": "object", "required": [ "nan_count", "nan_proportion", "min", "max", "mean", "median", "std", "histogram" ], "properties": { "nan_count": { "type": "integer" }, "nan_proportion": { "type": "number" }, "min": { "oneOf": [{ "type": "string" }, { "type": "null" }] }, "max": { "oneOf": [{ "type": "string" }, { "type": "null" }] }, "mean": { "oneOf": [{ "type": "string" }, { "type": "null" }] }, "median": { "oneOf": [{ "type": "string" }, { "type": "null" }] }, "std": { "oneOf": [{ "type": "string" }, { "type": "null" }] }, "histogram": { "oneOf": [ { "$ref": "#/components/schemas/DatetimeHistogram" }, { "type": "null" } ] } } }, "CategoricalStatisticsItem": { "type": "object", "description": "note that fields 
'no_label_count' and 'no_label_proportion' are not required, because some old entries still miss them, and we don't want to recompute all of them. See https://github.com/huggingface/dataset-viewer/issues/2573.", "required": ["nan_count", "nan_proportion", "n_unique", "frequencies"], "properties": { "nan_count": { "type": "integer" }, "nan_proportion": { "type": "number" }, "no_label_count": { "type": "integer" }, "no_label_proportion": { "type": "number" }, "n_unique": { "type": "integer" }, "frequencies": { "type": "object", "additionalProperties": { "type": "integer" } } } }, "BoolStatisticsItem": { "type": "object", "required": ["nan_count", "nan_proportion", "frequencies"], "properties": { "nan_count": { "type": "integer" }, "nan_proportion": { "type": "number" }, "frequencies": { "type": "object", "additionalProperties": { "type": "integer" } } } }, "SupportedStatistics": { "anyOf": [ { "$ref": "#/components/schemas/NumericalStatisticsItem" }, { "$ref": "#/components/schemas/DatetimeStatisticsItem" }, { "$ref": "#/components/schemas/CategoricalStatisticsItem" }, { "$ref": "#/components/schemas/BoolStatisticsItem" } ] }, "StatisticsPerColumnItem": { "type": "object", "required": ["column_name", "column_type", "column_statistics"], "properties": { "column_name": { "type": "string" }, "column_type": { "$ref": "#/components/schemas/ColumnType" }, "column_statistics": { "$ref": "#/components/schemas/SupportedStatistics" } } }, "StatisticsResponse": { "type": "object", "required": ["statistics", "num_examples"], "properties": { "statistics": { "type": "array", "items": { "$ref": "#/components/schemas/StatisticsPerColumnItem" } }, "num_examples": { "type": "integer" }, "partial": { "$ref": "#/components/schemas/Partial" } } }, "X-Error-Code-DatasetInBlockListError": { "type": "string", "const": "DatasetInBlockListError", "description": "The dataset is in the list of blocked datasets." 
}, "X-Error-Code-DatasetWithTooManyConfigsError": { "type": "string", "const": "DatasetWithTooManyConfigsError", "description": "The number of subsets of a dataset exceeded the limit." }, "X-Error-Code-ExternalAuthenticatedError": { "type": "string", "const": "ExternalAuthenticatedError", "description": "Raised when the external authentication check failed while the user was authenticated. Even if the external authentication server returns 403 in that case, we return 404 because we don't know if the dataset exists or not. It's also consistent with how the Hugging Face Hub works." }, "X-Error-Code-ExternalUnauthenticatedError": { "type": "string", "const": "ExternalUnauthenticatedError", "description": "The external authentication check failed while the user was unauthenticated." }, "X-Error-Code-ResponseNotFound": { "type": "string", "const": "ResponseNotFound", "description": "Raised when the response has not been found." }, "X-Error-Code-MissingRequiredParameter": { "type": "string", "const": "MissingRequiredParameter", "description": "A required parameter is missing." }, "X-Error-Code-ResponseNotReadyError": { "type": "string", "const": "ResponseNotReadyError", "description": "The response has not been processed yet." }, "X-Error-Code-RowsPostProcessingError": { "type": "string", "const": "RowsPostProcessingError", "description": "The rows could not be post-processed successfully." }, "X-Error-Code-StreamingRowsError": { "type": "string", "const": "StreamingRowsError", "description": "The rows could not be fetched in streaming mode." }, "X-Error-Code-UnexpectedError": { "type": "string", "const": "UnexpectedError", "description": "The job runner raised an unexpected error." } }, "securitySchemes": { "AuthorizationHuggingFaceApiToken": { "type": "http", "description": "The HuggingFace API token. Create a User Access Token with read access at https://huggingface.co/settings/tokens. You can also use an Organization API token. 
It gives access to the public datasets, to the [gated datasets](https://huggingface.co/docs/hub/datasets-gated) for which you have accepted the conditions, and to your private datasets if you're a [PRO user](https://huggingface.co/pricing) or if the dataset is under your [Enterprise Hub organization](https://huggingface.co/enterprise).", "scheme": "bearer", "bearerFormat": "A User Access Token is prefixed with `hf_`, while an Organization API token is prefixed with `api_org_`." }, "AuthorizationHuggingFaceJWT": { "type": "http", "description": "A JWT generated by the HuggingFace Hub, when it calls the API. This mechanism only works for JWT signed with the HuggingFace Hub's key. It gives access to the dataset.", "scheme": "bearer", "bearerFormat": "A JWT, prefixed with `jwt:`." } }, "examples": { "InexistentConfigError": { "summary": "The response is not found because the subset does not exist.", "description": "try with config=inexistent-subset.", "value": { "error": "Not found." } }, "InexistentDatasetError": { "summary": "The dataset does not exist.", "description": "try with dataset=inexistent-dataset.", "value": { "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." } }, "InexistentSplitError": { "summary": "The response is not found because the split does not exist.", "description": "try with split=inexistent-split.", "value": { "error": "Not found." } }, "UnauthorizedPrivateDatasetError": { "summary": "The dataset is private, and you are not authorized.", "description": "try with dataset=severo/test_private.", "value": { "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." 
} }, "AuthorizedPrivateDatasetError": { "summary": "The dataset is private, and you are authorized, but private datasets are not supported yet.", "description": "try with dataset=severo/test_private.", "value": { "error": "Not found." } }, "UnauthorizedGatedDatasetError": { "summary": "The dataset is public but gated, and you are not authenticated or authorized.", "description": "try with dataset=severo/test_gated.", "value": { "error": "The dataset does not exist, or is not accessible without authentication (private or gated). Please check the spelling of the dataset name or retry with authentication." } }, "MissingDatasetParameterError": { "summary": "The dataset parameter is missing.", "description": "try without setting ?dataset", "value": { "error": "Parameter 'dataset' is required" } }, "EmptyDatasetParameterError": { "summary": "The dataset parameter is empty.", "description": "try with ?dataset=", "value": { "error": "Parameter 'dataset' is required" } }, "MissingDatasetConfigSplitParameterError": { "summary": "One of the dataset, config or split parameters is missing.", "description": "try without setting ?dataset", "value": { "error": "Parameters 'dataset', 'config' and 'split' are required" } }, "EmptyDatasetConfigSplitParameterError": { "summary": "One of the dataset, config or split parameters is empty.", "description": "try with ?dataset=", "value": { "error": "Parameters 'dataset', 'config' and 'split' are required" } }, "ResponseNotReadyError": { "summary": "The response is not ready yet. You can retry later. The response header 'x-error-code' contains 'ResponseNotReadyError'.", "description": "Create a new dataset and try immediately, before the response could be generated.", "value": { "error": "The server is busier than usual and the response is not ready yet. Please retry later." 
} }, "UnexpectedJsonError": { "summary": "The server encountered an unexpected error", "description": "This error indicates a bug in the code or a failure in the infrastructure. It can be reported to https://github.com/huggingface/dataset-viewer/issues.", "value": { "error": "Unexpected error." } }, "UnexpectedTextError": { "summary": "The server encountered an unexpected error", "description": "This error indicates a bug in the code or a failure in the infrastructure. It can be reported to https://github.com/huggingface/dataset-viewer/issues.", "value": "Internal Server Error." } }, "parameters": { "RequiredDataset": { "name": "dataset", "in": "query", "description": "The identifier of the dataset on the Hub.", "required": true, "schema": { "type": "string" }, "examples": { "ylecun/mnist": { "summary": "A dataset", "value": "ylecun/mnist" } } }, "RequiredConfig": { "name": "config", "in": "query", "description": "The dataset subset (also called 'configuration').", "required": true, "schema": { "type": "string" }, "examples": { "cola": { "summary": "A subset of the nyu-mll/glue dataset", "value": "cola" }, "yangdong/ecqa": { "summary": "The default configuration given by the 🤗 Datasets library", "value": "default" } } }, "RequiredSplit": { "name": "split", "in": "query", "description": "The split name.", "required": true, "schema": { "type": "string" }, "examples": { "train": { "summary": "train split", "value": "train" }, "test": { "summary": "test split", "value": "test" }, "validation": { "summary": "validation split", "value": "validation" } } }, "OptionalConfig": { "name": "config", "in": "query", "description": "The dataset subset on which to filter the response.", "schema": { "type": "string" }, "examples": { "cola": { "summary": "A subset of the nyu-mll/glue dataset", "value": "cola" }, "yangdong/ecqa": { "summary": "The default subset given by the 🤗 Datasets library", "value": "default" } } }, "OptionalSplit": { "name": "split", "in": "query", 
"description": "The split name.", "schema": { "type": "string" }, "examples": { "train": { "summary": "train split", "value": "train" }, "test": { "summary": "test split", "value": "test" }, "validation": { "summary": "validation split", "value": "validation" } } } }, "responses": { "Common401": { "description": "If the external authentication step on the Hugging Face Hub failed, and no authentication mechanism has been provided. Retry with authentication.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-401" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "inexistent dataset, and not authenticated": { "$ref": "#/components/examples/InexistentDatasetError" }, "private dataset, and not authenticated or authorized": { "$ref": "#/components/examples/UnauthorizedPrivateDatasetError" } } } } }, "Dataset404": { "description": "If the repository to download from cannot be found. 
This may be because it doesn't exist, or because it is set to `private` and you do not have access.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-404" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "inexistent dataset, while authenticated": { "$ref": "#/components/examples/InexistentDatasetError" }, "private dataset, while authenticated and authorized": { "$ref": "#/components/examples/AuthorizedPrivateDatasetError" }, "gated dataset, and not authenticated or authorized": { "$ref": "#/components/examples/UnauthorizedGatedDatasetError" } } } } }, "Dataset422": { "description": "The 'dataset' parameter has not been provided or is invalid.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-422" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "missing dataset parameter": { "$ref": "#/components/examples/MissingDatasetParameterError" }, "empty dataset parameter": { "$ref": "#/components/examples/EmptyDatasetParameterError" } } } } }, "DatasetConfig404": { "description": "If the repository to download from cannot be found. 
This may be because the dataset or the config doesn't exist, or because the dataset is set to `private` and you do not have access.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-404" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "inexistent dataset, while authenticated": { "$ref": "#/components/examples/InexistentDatasetError" }, "private dataset, while authenticated and authorized": { "$ref": "#/components/examples/AuthorizedPrivateDatasetError" }, "gated dataset, and not authenticated or authorized": { "$ref": "#/components/examples/UnauthorizedGatedDatasetError" }, "inexistent config": { "$ref": "#/components/examples/InexistentConfigError" } } } } }, "DatasetConfigSplit404": { "description": "If the repository to download from cannot be found, or if the config or split does not exist in the dataset. 
Note that this may be because the dataset doesn't exist, or because it is set to `private` and you do not have access.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-404" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "inexistent dataset, while authenticated": { "$ref": "#/components/examples/InexistentDatasetError" }, "private dataset, while authenticated and authorized": { "$ref": "#/components/examples/AuthorizedPrivateDatasetError" }, "gated dataset, and not authenticated or authorized": { "$ref": "#/components/examples/UnauthorizedGatedDatasetError" }, "inexistent config": { "$ref": "#/components/examples/InexistentConfigError" }, "inexistent split": { "$ref": "#/components/examples/InexistentSplitError" } } } } }, "DatasetConfigSplit422": { "description": "Some of the 'dataset', 'config' or 'split' parameters have not been provided or are invalid.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-422" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "missing required parameter": { "$ref": "#/components/examples/MissingDatasetConfigSplitParameterError" }, "empty required parameter": { "$ref": "#/components/examples/EmptyDatasetConfigSplitParameterError" } } } } } } }, "paths": { "/splits": { "get": { "summary": "List of splits", "description": "The list of splits of a dataset.", "externalDocs": { "description": "See Splits (Hub docs)", "url": "https://huggingface.co/docs/dataset-viewer/splits" }, "operationId": "listSplits", "security": [ {}, { 
"AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" }, { "$ref": "#/components/parameters/OptionalConfig" } ], "responses": { "200": { "description": "A list of splits.
Beware: the response is not paginated.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/SplitsResponse" }, "examples": { "all splits in a dataset": { "summary": "ibm/duorc: two configs, six splits", "description": "Try with https://datasets-server.huggingface.co/splits?dataset=ibm/duorc.", "value": { "splits": [ { "dataset": "ibm/duorc", "config": "ParaphraseRC", "split": "train" }, { "dataset": "ibm/duorc", "config": "ParaphraseRC", "split": "validation" }, { "dataset": "ibm/duorc", "config": "ParaphraseRC", "split": "test" }, { "dataset": "ibm/duorc", "config": "SelfRC", "split": "train" }, { "dataset": "ibm/duorc", "config": "SelfRC", "split": "validation" }, { "dataset": "ibm/duorc", "config": "SelfRC", "split": "test" } ], "pending": [], "failed": [] } }, "splits for a single subset": { "summary": "dair-ai/emotion has two subsets. Setting config=unsplit only returns the splits for this subset.", "description": "Try with https://datasets-server.huggingface.co/splits?dataset=dair-ai/emotion&config=unsplit.", "value": { "splits": [ { "dataset": "dair-ai/emotion", "config": "unsplit", "split": "train" } ] } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "$ref": "#/components/responses/DatasetConfig404" }, "422": { "$ref": "#/components/responses/Dataset422" }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. 
If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "error in the dataset itself": { "summary": "The dataset is empty, or a file is missing, or some other error that prevents the response from being created.", "description": "Try with https://datasets-server.huggingface.co/splits?dataset=severo/empty", "value": { "error": "The dataset is empty.", "cause_exception": "EmptyDatasetError", "cause_message": "The directory at hf://datasets/severo/empty@5db043c2aee5fe0f2118c134de45f7b2e3230fbc doesn't contain any data files", "cause_traceback": [ "Traceback (most recent call last):\n", " File \"/src/services/worker/src/worker/job_runners/dataset/config_names.py\", line 56, in compute_config_names_response\n for config in sorted(get_dataset_config_names(path=dataset, use_auth_token=use_auth_token))\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/inspect.py\", line 351, in get_dataset_config_names\n dataset_module = dataset_module_factory(\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/load.py\", line 1486, in dataset_module_factory\n raise e1 from None\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/load.py\", line 1469, in dataset_module_factory\n return HubDatasetModuleFactoryWithoutScript(\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/load.py\", line 1032, in get_module\n else get_data_patterns(base_path, download_config=self.download_config)\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/data_files.py\", line 
459, in get_data_patterns\n raise EmptyDatasetError(f\"The directory at {base_path} doesn't contain any data files\") from None\n", "datasets.data_files.EmptyDatasetError: The directory at hf://datasets/severo/empty@5db043c2aee5fe0f2118c134de45f7b2e3230fbc doesn't contain any data files\n" ] } }, "response not ready": { "$ref": "#/components/examples/ResponseNotReadyError" }, "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } }, "501": { "description": "The server does not implement the feature.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-501" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "too many subsets in the dataset": { "summary": "The dataset has too many subsets. The server does not support more than 3,000 subsets.", "description": "Try with https://datasets-server.huggingface.co/splits?dataset=facebook/flores", "value": { "error": "The maximum number of configs allowed is 3000, dataset has 41617 configs." 
} } } } } } }, "requestBody": { "content": {} } } }, "/first-rows": { "get": { "summary": "First rows of a split", "description": "The list of the 100 first rows of a dataset split.", "externalDocs": { "description": "See First rows (Hub docs)", "url": "https://huggingface.co/docs/dataset-viewer/first-rows" }, "operationId": "listFirstRows", "security": [ {}, { "AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" }, { "$ref": "#/components/parameters/RequiredConfig" }, { "$ref": "#/components/parameters/RequiredSplit" } ], "responses": { "200": { "description": "The features, and the 100 first rows of the split.
Note: the response can be truncated (fewer rows, or truncated cell contents): see examples.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/FirstRowsResponse" }, "examples": { "A simple dataset (stanfordnlp/imdb) with text and label": { "summary": "Text, and label column. Only 3 rows are shown for brevity.", "description": "Try with https://datasets-server.huggingface.co/first-rows?dataset=stanfordnlp/imdb&config=plain_text&split=train.", "value": { "dataset": "stanfordnlp/imdb", "config": "plain_text", "split": "train", "features": [ { "feature_idx": 0, "name": "text", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 1, "name": "label", "type": { "names": ["neg", "pos"], "_type": "ClassLabel" } } ], "rows": [ { "row_idx": 0, "row": { "text": "I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered \"controversial\" I really had to see this for myself.

The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.

What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. While my countrymen mind find it shocking, in reality sex and nudity are a major staple in Swedish cinema. Even Ingmar Bergman, arguably their answer to good old boy John Ford, had sex scenes in his films.

I do commend the filmmakers for the fact that any sex shown in the film is shown for artistic purposes rather than just to shock people and make money to be shown in pornographic theaters in America. I AM CURIOUS-YELLOW is a good film for anyone wanting to study the meat and potatoes (no pun intended) of Swedish cinema. But really, this film doesn't have much of a plot.", "label": 0 }, "truncated_cells": [] }, { "row_idx": 1, "row": { "text": "\"I Am Curious: Yellow\" is a risible and pretentious steaming pile. It doesn't matter what one's political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn't true. I've seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don't exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we're treated to the site of Vincent Gallo's throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) \"double-standard\" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, and the same cannot be said for a man. In fact, you generally won't see female genitals in an American film in anything short of porn or explicit erotica. This alleged double-standard is less a double standard than an admittedly depressing ability to come to terms culturally with the insides of women's bodies.", "label": 0 }, "truncated_cells": [] }, { "row_idx": 2, "row": { "text": "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.

One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).

One might better spend one's time staring out a window at a tree growing.

", "label": 0 }, "truncated_cells": [] } ], "truncated": false } }, "Truncated cells": { "summary": "Truncated cells due to the response size (has a timestamp column).", "description": "Try with https://datasets-server.huggingface.co/first-rows?dataset=ETDataset/ett&config=m2&split=test.", "value": { "dataset": "ETDataset/ett", "config": "m2", "split": "test", "features": [ { "feature_idx": 0, "name": "start", "type": { "dtype": "timestamp[s]", "_type": "Value" } }, { "feature_idx": 1, "name": "target", "type": { "feature": { "dtype": "float32", "_type": "Value" }, "_type": "List" } }, { "feature_idx": 2, "name": "feat_static_cat", "type": { "feature": { "dtype": "uint64", "_type": "Value" }, "_type": "List" } }, { "feature_idx": 3, "name": "feat_dynamic_real", "type": { "feature": { "feature": { "dtype": "float32", "_type": "Value" }, "_type": "List" }, "_type": "List" } }, { "feature_idx": 4, "name": "item_id", "type": { "dtype": "string", "_type": "Value" } } ], "rows": [ { "row_idx": 0, "row": { "start": "2016-07-01T00:00:00", "target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039", "feat_static_cat": [0], "feat_dynamic_real": "[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611", "item_id": "OT" }, "truncated_cells": ["target", "feat_dynamic_real"] }, { "row_idx": 1, "row": { "start": "2016-07-01T00:00:00", "target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039", "feat_static_cat": [0], "feat_dynamic_real": "[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611", "item_id": "OT" }, "truncated_cells": ["target", "feat_dynamic_real"] }, { "row_idx": 2, "row": { "start": "2016-07-01T00:00:00", "target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039", "feat_static_cat": [0], "feat_dynamic_real": 
"[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611", "item_id": "OT" }, "truncated_cells": ["target", "feat_dynamic_real"] }, { "row_idx": 3, "row": { "start": "2016-07-01T00:00:00", "target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039", "feat_static_cat": [0], "feat_dynamic_real": "[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611", "item_id": "OT" }, "truncated_cells": ["target", "feat_dynamic_real"] }, { "row_idx": 4, "row": { "start": "2016-07-01T00:00:00", "target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039", "feat_static_cat": [0], "feat_dynamic_real": "[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611", "item_id": "OT" }, "truncated_cells": ["target", "feat_dynamic_real"] }, { "row_idx": 5, "row": { "start": "2016-07-01T00:00:00", "target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039", "feat_static_cat": [0], "feat_dynamic_real": "[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611", "item_id": "OT" }, "truncated_cells": ["target", "feat_dynamic_real"] }, { "row_idx": 6, "row": { "start": "2016-07-01T00:00:00", "target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039", "feat_static_cat": [0], "feat_dynamic_real": "[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611", "item_id": "OT" }, "truncated_cells": ["target", "feat_dynamic_real"] }, { "row_idx": 7, "row": { "start": "2016-07-01T00:00:00", "target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039", "feat_static_cat": [0], "feat_dynamic_real": 
"[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611", "item_id": "OT" }, "truncated_cells": ["target", "feat_dynamic_real"] }, { "row_idx": 8, "row": { "start": "2016-07-01T00:00:00", "target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039", "feat_static_cat": [0], "feat_dynamic_real": "[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611", "item_id": "OT" }, "truncated_cells": ["target", "feat_dynamic_real"] }, { "row_idx": 9, "row": { "start": "2016-07-01T00:00:00", "target": "[38.6619987487793,38.222999572753906,37.34400177001953,37.124000549316406,37.124000549316406,36.9039", "feat_static_cat": [0], "feat_dynamic_real": "[[41.130001068115234,39.62200164794922,38.86800003051758,35.518001556396484,37.52799987792969,37.611", "item_id": "OT" }, "truncated_cells": ["target", "feat_dynamic_real"] } ], "truncated": true } }, "Image column": { "summary": "A column with images. 
Only 3 rows are shown for brevity.", "description": "Try with https://datasets-server.huggingface.co/first-rows?dataset=huggan/horse2zebra&config=default&split=train.", "value": { "dataset": "huggan/horse2zebra", "config": "default", "split": "train", "features": [ { "feature_idx": 0, "name": "imageA", "type": { "_type": "Image" } }, { "feature_idx": 1, "name": "imageB", "type": { "_type": "Image" } } ], "rows": [ { "row_idx": 0, "row": { "imageA": { "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/0/imageA/image.jpg?Expires=1710430932&Signature=zozXKy248etfBQFS4sfUuMwtEvD1OND8ioc3UN5tQ3yYzXdZFo4BiJI5oY0TfE83z~CtLL-Ni5j3lwlBV5L6wO8ymYlR-4CLRyazIx0-1kxQTSKgK2-Ae6uUpYtqqN7tOZMHUYT8l71nGp~YQ8nFhJ9Nlzom3Lc~slnkHWFktExeEo8ZEYH6zDB3wy6exo6~WdD2dMDiVne1aCuwcs~RidUXMhIllYA-WjWC9sZMXWdSzu41YV3x~WJd5U02eSZg04-gXcVOOW8KSgYuDzGZWA-rFTKz054-BIQxvKb4Hy1joUHVSbrSx88qGTJFfeGNGy9DCYX~vBc9zZ8~l8WE7w__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 }, "imageB": { "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/0/imageB/image.jpg?Expires=1710430932&Signature=eFyrF9DAKw3m8vbebm8CBym~byHbzZbTNwYWd0LqZ3rvM8KofUMGCLTERtH~oIUqWmFBbTWXa4IewozsWR5H9VMCgGAFF-2fXOntrAu~QgkNB5pAt3BAevMjnyCNYX-SQjgDeJ5M6p9oaHt4iFX210ITI~o0BVabXX5RJmEJUY3LZFh1-VX5ZI9hub-7hx661sGH5avEnSvz5xtafLaZSYzD9pcoNJuE9hWQOeC~nv7JhXamT~~ZG-iKyJyWJXg9umK6nAIPewAJBOPQivoITzirhGMG~qFvkJbpWrgmsYmr~~swYm8X0bMHUSr0Zj1yXKxMRjVBCzgtL~1eym7-mQ__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 } }, "truncated_cells": [] }, { "row_idx": 1, "row": { "imageA": { "src": 
"https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/1/imageA/image.jpg?Expires=1710430932&Signature=nKvinmfHvG~dlANvbld6LWfM~5k2-DoDJWdesj-YlqGynJ7J83Pr1tGbIlJPBrA~ugl~zGt5i~DtUlogAXJo~ofSaH7WrgI5ffWnBMLgzvXKPa7Ktm5EkbXJTOCubeA9kHStUZ0h3yQJrzkP1wbbwFkaI7tKfI14MhRWUCbgrPUTaZWUcSMK28YZvCpJlYJMYug8uu~aMXE1TJj~zbvMHKBGJoTIksVlf7E7MK6fAqYOl7lNeEUd5qGrN9flvhY23TR9tmFncZFfxCRZItbkyrvCWCgs2pSHdMBYCuS-Io4Bc4r4FFlYUvB1aVrgf7LizqPU9gGk-BRMis5iT~LdjA__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 }, "imageB": { "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/1/imageB/image.jpg?Expires=1710430932&Signature=ZclSwX9zuO8R~hcYPW4LC8uLditjlHHXH9eDixM265CH1UOr1NT7-e~o7BkRBlRUt4XRJTzRrbCpCEGCOFg3f1Xhm411l3nXwvDHlLWFyF-h1yql3ERLxsS1Uk9IEnW2mNSWYBKO6ZpboXh~kWTsnbiOTTH6~GPNJD0weWqADiRdJYpzYOjZ6cG1I-AxUmkltgLOusp6Bj4fgOmbulMn2xn~ZLIvhCr5dnXbN6X2eGzGEnSVr742wH5FrlCPM66NhHhB8P4w0RAiPCp93dkFzXE3ramSffZ1vd-2BdZmHvMeupCYmcfEDxLwHnRJM0nc9FOQKqH2JhEmsMm~di0TKQ__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 } }, "truncated_cells": [] }, { "row_idx": 2, "row": { "imageA": { "src": "https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/2/imageA/image.jpg?Expires=1710430932&Signature=uq5l~PHqBWsTW2QtzPSjHdTaQZy0ku9i~5zkvvLZIwiatWPLr0DwXvHAuVBLUzlU2w9BnJl~wDEs7Oxh3TA-ELe9WuW9APek7ZeNWGmLt5MIjIj4pd6apCgLY6PoKhHot~l0at3HiAtduhoSzvY5ZOt2KYkMBUxbCsJSNOOE88xJh-E6gbODgXqVuN1Som6mpbj812IsK79lwqgHd8EiJQMNo7KCEY33g2Wu8~sArOIwUNwslCyOp1FUyZZpcU1h4ihu244tFA55LfG6folvjXzlQ1FxFjK84ZNnjqJzjaVRigsB9q8EZdeVq-86xZgJxhWqBzab9aBWjSrsoCwQ7A__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 }, "imageB": { "src": 
"https://datasets-server.huggingface.co/assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/2/imageB/image.jpg?Expires=1710430932&Signature=XLSxYM6ZOcJQ7zaF4plNYIddwC~DeCo9M2J4GV9DuJab3R1dXidz-0J-MfaCfVhujC6GWnIfvY0TDZJUbcXMbfOYwo9wjYvrtKixZnpnWtSq8Xsg~Xc-6XBK-imA8KMnD58GTy7KJ7xCbLYcduZKnuc6VtF664x9QcVrvQb00RoAzEcmAQkh877ELen5uTOnxTfSfesO9KjqG06br6yDrxzs0cGmr7aJLygKnNcPuABt87Fk~-oR41Kmb1iE0mXtcNJ1Dkbgifl9w9MBXCISJQMHD-Q8yM~shs5iY5TQV7wOBmBkj~m-vrBlli5GWmxl-8krU-GLGaJV3CfBway80g__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 } }, "truncated_cells": [] } ], "truncated": true } }, "Audio column": { "summary": "A column with audio files. Only 3 rows are shown for brevity.", "description": "Try with https://datasets-server.huggingface.co/first-rows?dataset=facebook/voxpopuli&config=cs&split=train.", "value": { "dataset": "facebook/voxpopuli", "config": "cs", "split": "train", "features": [ { "feature_idx": 0, "name": "audio_id", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 1, "name": "language", "type": { "names": [ "en", "de", "fr", "es", "pl", "it", "ro", "hu", "cs", "nl", "fi", "hr", "sk", "sl", "et", "lt", "en_accented" ], "_type": "ClassLabel" } }, { "feature_idx": 2, "name": "audio", "type": { "sampling_rate": 16000, "_type": "Audio" } }, { "feature_idx": 3, "name": "raw_text", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 4, "name": "normalized_text", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 5, "name": "gender", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 6, "name": "speaker_id", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 7, "name": "is_gold_transcript", "type": { "dtype": "bool", "_type": "Value" } }, { "feature_idx": 8, "name": "accent", "type": { "dtype": "string", "_type": "Value" } } ], "rows": [ { "row_idx": 0, "row": { "audio_id": "20140204-0900-PLENARY-23-cs_20140204-22:43:32_1", 
"language": 8, "audio": [ { "src": "https://datasets-server.huggingface.co/assets/facebook/voxpopuli/--/cs/train/0/audio/audio.wav?Expires=1710431144&Signature=x1PmZKI9Atra5wr06NyIMkklk7uTLF2cUvwvxMK6kCzoPkR-OgtYdPtWw~4PNOvI~wnOoc3JiS39ym~YG8x1UjlBAVUypb80qVrrCT1ni5i0N5wvybcdwP56ts8pViwXgyrkbFT9WcImOCfeMdwEsTA~T6wFKYzklEx2kIOhIYkKSD65dUOt8FAsW5qloQ6pTs9RRoCXTQJtWfzpfs5rim1N~8BcaIupnC7ic3P~m-NkaGm-wJi8RWUjGXcepmGoSHV2OeeXB7xJ13JqPjK33QXdrrDxW59oR43Ikcr4NGuyKzuu1mEvS3VHrOpl87zW5qJyv3~ra1bRIoK3fegPtw__&Key-Pair-Id=K3EI6M078Z3AC3", "type": "audio/wav" } ], "raw_text": "Musí být transparentní a srozumitelný, firmám musí zaručovat právní jistotu, musí usilovat o odstraňování zbytečných byrokratických překážek, ale současně musí umět zaručit a vymoci i vysokou úroveň ochrany spotřebitelů, a zejména jejich bezpečnost.", "normalized_text": "musí být transparentní a srozumitelný firmám musí zaručovat právní jistotu musí usilovat o odstraňování zbytečných byrokratických překážek ale současně musí umět zaručit a vymoci i vysokou úroveň ochrany spotřebitelů a zejména jejich bezpečnost.", "gender": "female", "speaker_id": "96718", "is_gold_transcript": true, "accent": "None" }, "truncated_cells": [] }, { "row_idx": 1, "row": { "audio_id": "20160414-0900-PLENARY-10-cs_20160414-13:44:10_1", "language": 8, "audio": [ { "src": "https://datasets-server.huggingface.co/assets/facebook/voxpopuli/--/cs/train/1/audio/audio.wav?Expires=1710431144&Signature=bAG9Jlhy5WLZhKcCId4e3tEoCpozDe7exW44BsZf~iWJEX9i20LLv8wqGWqgrv0lxpUuIEiuEmncQGT8FMR-tW-MLSaTJf0fDoD0n0C78TVb5eGaTH0dIy8EHHULa7Rk-evgV2sGxaQkypmMaHlwRmIEgAqtHfapntIJzhURpz8b9rIlNZWggqdSW-vzi7iWJvSJXDeF9Oh94c5Hprz2EGK3Mz0760XlIfwcAk~abGrrh70TeRISflXISjw8zA25OUd9bU9Do~C~ggabPaoL3NW61dUxZ8jjfuPYusCXcYPIadA6Hl1AUEe71rrwaDGipbqGYgCUl-OaNpSB1eIzQQ__&Key-Pair-Id=K3EI6M078Z3AC3", "type": "audio/wav" } ], "raw_text": "Ve chvíli, kdy jsou data dostatečně chráněna předchozími právními texty, které také dnes byly schváleny, jsem přesvědčen, že 
vzhledem k trvale hrozícím teroristickým útokům v Evropě je nutné, aby policie při vyšetřování teroristických útoků a jiných forem závažné činnosti se mohla dostat k informacím, kdo kam letěl.", "normalized_text": "ve chvíli kdy jsou data dostatečně chráněna předchozími právními texty které také dnes byly schváleny jsem přesvědčen že vzhledem k trvale hrozícím teroristickým útokům v evropě je nutné aby policie při vyšetřování teroristických útoků a jiných forem závažné činnosti se mohla dostat k informacím kdo kam letěl.", "gender": "male", "speaker_id": "125706", "is_gold_transcript": true, "accent": "None" }, "truncated_cells": [] }, { "row_idx": 2, "row": { "audio_id": "20170912-0900-PLENARY-7-cs_20170912-12:00:04_3", "language": 8, "audio": [ { "src": "https://datasets-server.huggingface.co/assets/facebook/voxpopuli/--/cs/train/2/audio/audio.wav?Expires=1710431144&Signature=uiFrPX6TSfUUuQCGhRmDnBIHs69N47qnktE2Xo0JwxF1As9Gj8GQTKT~yowyPrALvrdZ-SGIfLHFQAkjcZjo9bKJ1nIy4vx34KZgR7HHM~xwk~ig4Fpvk4pe7f4gBVSwtYf13YbI4eheGg5i5Bc4ORhuLj5fAIZJ0FYyPlVcEd9SlNU-FEFHFryNyoBx2D-KJSyxKF7kNWfBE3tkXvDbhWrgkh0Oa5lrBwz8Y83eeheT3lB1O07Ea4-UAUtjvXA6c-eIHUPelIXeOoeFgHi7LoagPCHhig2ysE~AfO-zugg7AsOzvJepLKQB3NMF26ZOPHl4WuZzHKoOPgngjTyFSA__&Key-Pair-Id=K3EI6M078Z3AC3", "type": "audio/wav" } ], "raw_text": "A já se vás tedy ptám Opravdu je toto způsob, kterým se představujeme spolupráci a ochranu spotřebitelů?", "normalized_text": "a já se vás tedy ptám opravdu je toto způsob kterým se představujeme spolupráci a ochranu spotřebitelů?", "gender": "female", "speaker_id": "23699", "is_gold_transcript": true, "accent": "None" }, "truncated_cells": [] } ], "truncated": true } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "$ref": "#/components/responses/DatasetConfigSplit404" }, "422": { "$ref": "#/components/responses/DatasetConfigSplit422" }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the 
response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500-first-rows" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response from being created.", "description": "Try with https://datasets-server.huggingface.co/first-rows?dataset=allenai/atomic&config=atomic&split=train", "value": { "error": "Cannot load the dataset split (in streaming mode) to extract the first rows.", "cause_exception": "FileNotFoundError", "cause_message": "https://homes.cs.washington.edu/~msap/atomic/data/atomic_data.tgz", "cause_traceback": [ "Traceback (most recent call last):\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 417, in _info\n await _file_info(\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 837, in _file_info\n r.raise_for_status()\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/aiohttp/client_reqrep.py\", line 1005, in raise_for_status\n raise ClientResponseError(\n", "aiohttp.client_exceptions.ClientResponseError: 404, message='Not Found', url=URL('https://maartensap.com/atomic/data/atomic_data.tgz')\n", "\nThe above exception was the direct cause of the following exception:\n\n", "Traceback (most recent call last):\n", " File \"/src/services/worker/src/worker/utils.py\", line 363, in get_rows_or_raise\n 
return get_rows(\n", " File \"/src/services/worker/src/worker/utils.py\", line 305, in decorator\n return func(*args, **kwargs)\n", " File \"/src/services/worker/src/worker/utils.py\", line 341, in get_rows\n rows_plus_one = list(itertools.islice(ds, rows_max_number + 1))\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py\", line 981, in __iter__\n for key, example in ex_iterable:\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/iterable_dataset.py\", line 116, in __iter__\n yield from self.generate_examples_fn(**self.kwargs)\n", " File \"/tmp/modules-cache/datasets_modules/datasets/allenai/atomic/c0f0ec7d10713c41dfc87f0cf17f936b122d22e19216051217c99134d38f6d7b/atomic.py\", line 123, in _generate_examples\n for path, f in files:\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/streaming_download_manager.py\", line 866, in __iter__\n yield from self.generator(*self.args, **self.kwargs)\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/streaming_download_manager.py\", line 917, in _iter_from_urlpath\n with xopen(urlpath, \"rb\", use_auth_token=use_auth_token) as f:\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/datasets/download/streaming_download_manager.py\", line 498, in xopen\n file_obj = fsspec.open(file, mode=mode, *args, **kwargs).open()\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/core.py\", line 134, in open\n return self.__enter__()\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/core.py\", line 102, in __enter__\n f = self.fs.open(self.path, mode=mode)\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/spec.py\", line 1199, in open\n f = self._open(\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 356, in _open\n size = size or self.info(path, 
**kwargs)[\"size\"]\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/asyn.py\", line 115, in wrapper\n return sync(self.loop, func, *args, **kwargs)\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/asyn.py\", line 100, in sync\n raise return_result\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/asyn.py\", line 55, in _runner\n result[0] = await coro\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/fsspec/implementations/http.py\", line 430, in _info\n raise FileNotFoundError(url) from exc\n", "FileNotFoundError: https://homes.cs.washington.edu/~msap/atomic/data/atomic_data.tgz\n" ] } }, "response not ready": { "$ref": "#/components/examples/ResponseNotReadyError" }, "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } }, "501": { "description": "The server does not implement the feature.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-501" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": {} } } } } } }, "/rows": { "get": { "summary": "A slice of rows of a split", "description": "The list of rows of a dataset split at a given slice location (offset). 
Up to 100 rows are returned; use the length parameter to get fewer.", "externalDocs": { "description": "See rows (Hub docs)", "url": "https://huggingface.co/docs/dataset-viewer/rows" }, "operationId": "listRows", "security": [ {}, { "AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" }, { "$ref": "#/components/parameters/RequiredConfig" }, { "$ref": "#/components/parameters/RequiredSplit" }, { "name": "offset", "in": "query", "description": "The offset of the slice.", "schema": { "type": "integer", "default": 0, "minimum": 0 }, "examples": { "0": { "summary": "from the beginning", "value": 0 }, "100": { "summary": "from the row at index 100", "value": 100 } } }, { "name": "length", "in": "query", "description": "The length of the slice.", "schema": { "type": "integer", "default": 100, "minimum": 0, "maximum": 100 }, "examples": { "100": { "summary": "a slice of 100 rows", "value": 100 } } } ], "responses": { "200": { "description": "The features, and the list of rows of the requested slice. 
Bytes columns are not supported at the moment, and their content will be 'null'.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/PaginatedResponse" }, "examples": { "A slice of a simple dataset (stanfordnlp/imdb)": { "summary": "Get a slice of length 3 from row 234 (offset=234&length=3).", "description": "Try with https://datasets-server.huggingface.co/rows?dataset=stanfordnlp/imdb&config=plain_text&split=train&offset=234&length=3.", "value": { "features": [ { "feature_idx": 0, "name": "text", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 1, "name": "label", "type": { "names": ["neg", "pos"], "_type": "ClassLabel" } } ], "rows": [ { "row_idx": 234, "row": { "text": "Well, you know the rest! This has to be the worst movie I've seen in a long long time. I can only imagine that Stephanie Beaham had some bills to pay when taking on this role.

The lead role is played by (to me) a complete unknown and I would imagine disappeared right back into obscurity right after this turkey.

Bruce Lee led the martial arts charge in the early 70's and since then fight scenes have to be either martial arts based or at least brutal if using street fighting techniques. This movie uses fast cuts to show off the martial arts, however, even this can't disguise the fact that the lady doesn't know how to throw a punch. An average 8 year old boy would take her apart on this showing.

Sorry, the only mystery on show here is how this didn't win the golden raspberry for its year.", "label": 0 }, "truncated_cells": [] }, { "row_idx": 235, "row": { "text": "I'm in Iraq right now doing a job that gives plenty of time for watching movies. We also have access to plenty of pirated movies, this gem came along with 11 other movies, and this is easily the worst I've seen in a long time. I've seen a few other reviews that claim this movie doesn't take itself too seriously, but really, I think that's a cover up for the fact that its horrible. It's not tongue in cheek, the writers really thought they were improving on the movie Blade. This movie is just one notch above Vampire Assassin, which if you haven't seen, i recommend. At least that movie is so unbelievably bad that you'll laugh harder than you thought possible. This is right at that cusp of no redeeming qualities what so ever. from the bad acting, to cliché visual (ie opening credits), to the adobe premier special effects. they couldn't even get blanks for the guns, which may have to do with where the movie was filmed, but if you're going to use effects, make them close to accurate. as for the cast, it seems like they just went to a tae bo class and picked up the first not to ugly chick that walked out. Once again, like Ron Hall in Vampire Assassin, don't let stunt folk act, they can't. Also, the comment about this being a \"return of old vampire movies\"...no, it's not. This is exactly what all new vampire movies are about. Buffy the Vampire Slayer, Blade, Underworld, they're all about some super star fighting the vampires. This is the newest vampire genre, with bad blood, fake screams, and cheesy over acting. obviously anyone who wrote a good review about this is somehow connected to the movie, or friends of the cast. But what do I care, I paid 33 cents for it. 
Anyway, to wrap this up, someone in their first semester of film school decided to make a movie, I give them credit because it's better than I could do. Of course I also know I can't make movies so I don't try. I do know how to watch movies though. I work 12 hour nights, 6 days a week, I've seen several thousand in the year I've been out here and this was so bad that half way through i was hoping for a mortar attack.", "label": 0 }, "truncated_cells": [] }, { "row_idx": 236, "row": { "text": "\"Valentine\" is another horror movie to add to the stalk and slash movie list (think \"Halloween\", \"Friday the 13th\", \"Scream\", and \"I Know What You Did Last Summer\"). It certainly isn't as good as those movies that I have listed about, but it's better than most of the ripoffs that came out after the first \"Friday the 13th\" film. One of those films was the 1981 Canadian made \"My Bloody Valentine\", which I hated alot. \"Valentine\" is a better film than that one, but it's not saying much. The plot: a nerdy young boy is teased and pranked by a couple of his classmates at the beginning of the film. Then the film moves years later when those classmates are all grown up, then they're picked off one-by-one. The killer is presumed to be the young boy now all grown up looking for revenge. But is it him? Or could it be somebody else? \"Valentine\" has an attractive cast which includes Denise Richards, David Boreanaz, Marley Shelton, Jessica Capshaw, and Katherine Heigl. They do what they can with the material they've got, but a lackluster script doesn't really do them any justice. There are some scary moments throughout, however.

** (out of four)", "label": 0 }, "truncated_cells": [] } ], "num_rows_total": 25000, "num_rows_per_page": 100, "partial": false } }, "A slice of an image dataset (huggan/horse2zebra)": { "summary": "Get a slice of length 3 from row 234 (offset=234&length=3).", "description": "Try with https://datasets-server.huggingface.co/rows?dataset=huggan/horse2zebra&config=default&split=train&offset=234&length=3.", "value": { "features": [ { "feature_idx": 0, "name": "imageA", "type": { "_type": "Image" } }, { "feature_idx": 1, "name": "imageB", "type": { "_type": "Image" } } ], "rows": [ { "row_idx": 234, "row": { "imageA": { "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/234/imageA/image.jpg?Expires=1710431246&Signature=a~CeO0qzr8yyQwufhla6PCYoP53zDNPzw30GOeffqrcWBAFr05C5n~7aAEcOpZAqLVlQJ4YvjubfYVS-7JT2wlYtwhZsFyGexuCTmOw4XqS~l-gkq-H9f7aT8yrt3rQADyWilIP0sT2kt4Co2K2vKLtSDJr4dVEf2Qfm4cWbjJwyWvokJxRvPDF31eM86BV92iSMddKmfcvTKiOj4A21wAWFZDdZlmUtM0hb4xi3eBmIx7w95WYjgzFgsXgbJE~7x7rZSYvu1v6Z90LYJUbEljkuUWBe2bdL77E4ThJluGW6nZduvldJm23gYE7bQDV3oXMf-AFCLC5qS86xQ9ztcA__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 }, "imageB": { "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/234/imageB/image.jpg?Expires=1710431246&Signature=YokwalGNudb3nd3WL-U9TUGMczp~BTcLv5Xp6Pi9RXqCcHyaP4~UC8XvVoBTl5D3ikoojPXwEZKi849S9nJ0HTYwJqPznheNgKU0tFwmw~A6-aFlqkamNVXP9IoN8o8nfmWpt4T9etM9OzzHO9xE~7Lci98~lVsQa0qrjwrgYklDFlHs8aWnXHb-ZgLiuf6A1JTsXniWPfJxy-Yqylq453KDnOsu08I9LLen~huTGuR1Vsk3mpyED3Rj~k1lxIOkCTavexHIVtSg7LZRpCElt~S4QeT6wk6XjCYKtjidIzh2XNBmdPTtbf3a40Y0xEeRWIa6A9T9SfVH0VSgspfbzg__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 } }, "truncated_cells": [] }, { "row_idx": 235, "row": { "imageA": { "src": 
"https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/235/imageA/image.jpg?Expires=1710431246&Signature=E~04rHNmVaJGeXI4Q3uILInhuwtB9S0ifUK-wbO-wpE3Qpb~39W~0BRGEiUwTVwtsnyO7ExqXm-oda2ak1km-0cN-gnhRsLtsatHBbVYpxlsYhEI1NRSGJEgo5wKpop~z5iaYQYXZD4G77besdS-H8aufz1mDOoOKq~rx~Xr6u9xsuo3wxbkFwBHw8TQjl2VO~55kVUvjuzNKgZsIcBNBNJnI8LHEKoz60KQjXXG3tNX4-td1pnV6W~AYseti2TdqsUG0wf53Cr-FT9D0IvOiJKcLgbyFfTVcbjivzVzSfsYjVbFYcj90u021ZAT9W2si2RNVoOFrAMat7EeSe4SQA__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 }, "imageB": { "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/235/imageB/image.jpg?Expires=1710431246&Signature=Qtqq6QcXJMmq2f13O9zAhIACS3IXlCQB-8xx1wIzTl4q~G5bbMC3NtTDfMSGgKAcQm-riU7UdQCQyZMw-188b5BFZm77ms1dSYMz3eni4F468l9-QXmpt8OQ7BzS19~XRw264B~vOgkIIbOKum-NU-Tbu4HxXEOwjbr2Km2vpvb2DflznRmDJwdDnb4oNzclwBMUd5moNQ93ZkAgiYudI4jqyk0bdcBG6yRv3GGakt-QljCePWkAt4xWeSx84JcVlP1almAiHWqJAWI3brkEFUiB0pJNT~ZJ90z0kes-tuHPd3h6UmTBcHGWtdc-bjKFKHYC6giQ8~vIO9unRXoMfA__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 } }, "truncated_cells": [] }, { "row_idx": 236, "row": { "imageA": { "src": "https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/236/imageA/image.jpg?Expires=1710431246&Signature=mBVTb2gxW49b8chEm5N-rmRMY6IvR~h~neKaexUAwROM4y7QthKjJQu7yv0W9gv3s0n2Dqyi~rxlGcd2NaS1qHMig4BC9rgA-I5fmmrsaoLAnvmk4gy3bXodS5N92lWLNvwCO6~gsZBbkyAu6f2eIJZXMilwKziB9rDTCRh4NqPp9g-EPjNXvhk4G6ZoaNzGKhb7lIdnF~-GDyuvQiN~HzesR7gcw992P-ejSapd4Kch9Yem~XvcplgZ9wXHEeCvg42G-TDY6JbvkJfWx-TIOVUg1gGpK8wYmvMqxM952vYbDzke7HqPm8HOiYKm850k5BqCKXVlmEX9zG3vI6fmxQ__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 }, "imageB": { "src": 
"https://datasets-server.huggingface.co/cached-assets/huggan/horse2zebra/--/67e0da2e44e860e37857edf17af7b2656b3be221/--/default/train/236/imageB/image.jpg?Expires=1710431247&Signature=IUDp-l1heP5wXtcAIcAHmf8XUNB7VLjAVmAo0919y2nj6R4SbD7frnrLSnOwMc6dDTNWiKck0u2W7z0JRuk2jdIoezL32pY0y1kkIZX66oARp3xQlClacJtK44wRFTI-njngUAPdWujG2gX05YTZSd4k0NSmDKQAXVx1ckUMjzFUqAjwj-~U6KPW4k656Tq95wXMGe7~RrYSygcJ8ZjaKiZgUySwc6lJbVVPDPc5hIeq-HGuqtPiyy0m-pasixsckwQu-DdQiPyoq27manjdZ8pQ7apn9UOAwZBhKEYbkbDVMYNfRhM-BG3VMCbOE8xqvQXxLGqv15AVhVHlCZd~oQ__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 256, "width": 256 } }, "truncated_cells": [] } ], "num_rows_total": 1334, "num_rows_per_page": 100, "partial": false } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "description": "If the repository to download from cannot be found, or if the config or split does not exist in the dataset. Note that this may be because the dataset doesn't exist, or because it is set to `private` and you do not have access.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-404" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "inexistent dataset, while authenticated": { "$ref": "#/components/examples/InexistentDatasetError" }, "private dataset, while authenticated and authorized": { "$ref": "#/components/examples/AuthorizedPrivateDatasetError" }, "gated dataset, and not authenticated or authorized": { "$ref": "#/components/examples/UnauthorizedGatedDatasetError" }, "inexistent config": { "$ref": "#/components/examples/InexistentConfigError" }, "inexistent split": { "$ref": "#/components/examples/InexistentSplitError" }, "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", 
"description": "Try with https://datasets-server.huggingface.co/rows?dataset=allenai/atomic&config=atomic&split=train. It's a bug: it should be a 500 error, see https://github.com/huggingface/dataset-viewer/issues/1661.", "value": { "error": "Not found." } } } } } }, "422": { "description": "Some of the 'dataset', 'config', 'split', 'offset' or 'length' parameters have not been provided or are invalid.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-422" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "missing-dataset": { "summary": "The dataset parameter is missing.", "value": { "error": "Parameter 'dataset' is required" } }, "missing-config": { "summary": "The config parameter is missing.", "value": { "error": "Parameter 'config' is required" } }, "missing-split": { "summary": "The split parameter is missing.", "value": { "error": "Parameter 'split' is required" } }, "empty-dataset": { "summary": "The dataset parameter is empty.", "value": { "error": "Parameter 'dataset' is required" } }, "empty-config": { "summary": "The config parameter is empty.", "value": { "error": "Parameter 'config' is required" } }, "empty-split": { "summary": "The split parameter is empty.", "value": { "error": "Parameter 'split' is required" } }, "non-integer-offset": { "summary": "The offset must be an integer.", "value": { "error": "Parameter 'offset' must be integer" } }, "negative-offset": { "summary": "The offset must not be negative.", "value": { "error": "Parameter 'offset' must be positive" } }, "non-integer-length": { "summary": "The length must be an integer.", "value": { "error": "Parameter 'length' must be integer" } }, "negative-length": { "summary": "The length must not be negative.", "value": { "error": "Parameter 'length' must be 
positive" } }, "too-large-length": { "summary": "The length must not be too large.", "value": { "error": "Parameter 'length' must not be greater than 100" } } } } } }, "500": { "description": "The server crashed, or the response couldn't be generated successfully due to an error in the dataset itself. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500-rows" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } }, "501": { "description": "The server does not implement the feature.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-501" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": {} } } } } } }, "/search": { "get": { "summary": "Full-text search in the text columns of a split", "description": "Returns the rows matching the query, ordered by row index. Up to 100 rows are returned. 
The offset and length parameters allow navigating the results.", "externalDocs": { "description": "See search (Hub docs)", "url": "https://huggingface.co/docs/dataset-viewer/search" }, "operationId": "searchRows", "security": [ {}, { "AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" }, { "$ref": "#/components/parameters/RequiredConfig" }, { "$ref": "#/components/parameters/RequiredSplit" }, { "name": "query", "in": "query", "description": "The search query.", "required": true, "schema": { "type": "string" }, "examples": { "dog": { "summary": "search the rows that contain the text 'dog'", "value": "dog" } } }, { "name": "offset", "in": "query", "description": "The offset of the returned rows.", "schema": { "type": "integer", "default": 0, "minimum": 0 }, "examples": { "0": { "summary": "from the beginning", "value": 0 }, "100": { "summary": "ignore the first 100 results", "value": 100 } } }, { "name": "length", "in": "query", "description": "The maximum number of returned rows", "schema": { "type": "integer", "default": 100, "minimum": 0, "maximum": 100 }, "examples": { "100": { "summary": "up to 100 rows in the response", "value": 100 } } } ], "responses": { "200": { "description": "The features, and the list of rows that match the search query. The query will only be searched among the string columns. 
Bytes columns are not supported at the moment, and their content will be 'null'.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/PaginatedResponse" }, "examples": { "Search on a text dataset (stanfordnlp/imdb)": { "summary": "The first 3 rows that match the 'dog' search query (query=dog&length=3).", "description": "Try with https://datasets-server.huggingface.co/search?dataset=stanfordnlp/imdb&config=plain_text&split=train&query=dog&length=3.", "value": { "features": [ { "feature_idx": 0, "name": "text", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 1, "name": "label", "type": { "dtype": "int64", "_type": "Value" } } ], "rows": [ { "row_idx": 27, "row": { "text": "Pedantic, overlong fabrication which attempts to chronicle the birth of the Federal Bureau of Investigations. Begins quite promisingly, with a still-relevant probe into an airplane explosion, however the melodrama involving James Stewart and wife Vera Miles just gets in the way (Miles had a habit of playing tepid wives under duress, and her frayed nerves arrive here right on schedule). Esteemed director Mervyn LeRoy helmed this adaptation of Don Whitehead's book, but despite the talent involved, the picture fails to make much of an impression. Best performance is turned in by Murray Hamilton as Stewart's partner, however most of the dialogue is ludicrous and the dogged pacing causes the movie to seem twice as long as it is. *1/2 from ****", "label": 0 }, "truncated_cells": [] }, { "row_idx": 51, "row": { "text": "The opening shot was the best thing about this movie, because it gave you hope that you would be seeing a passionate, well-crafted independent film. Damn that opening shot for filling me hope. 
As the \"film\" progressed in a slow, plodding manner, my thoughts were varied in relation to this \"film\": Was there too much butter in my popcorn? Did the actors have to PAY the director to be in this \"film\"? Did I get my ticket validated at the Box Office? Yes, dear reader. I saw this film in the Theatre! This would be the only exception I will make about seeing a film at home over a Movie Theatre, because at home you can TURN IT OFF. Were there any redeeming values? Peter Lemongelli as the standard college \"nerd\" had his moments, especially in a dog collar. Other than that this \"film\" went from trying to be a comedy, to a family drama to a spiritual uplifter. It succeeded on none of these fronts. Oh, and the girlfriend was realllllllllly bad. Her performance was the only comedy I found.", "label": 0 }, "truncated_cells": [] }, { "row_idx": 106, "row": { "text": "I saw this movie at the AFI Dallas festival. Most of the audience, including my wife, enjoyed this comedy-drama, but I didn't. It stars Lucas Haas (Brick, Alpha Dog), Molly Parker (Kissed, The Five Senses, Hollywoodland) and Adam Scott (First Snow, Art School Confidential). The director is Matt Bissonnette, who's married to Molly Parker. All three actors do a fine job in this movie about 3 friends, the marriage of two of them and infidelity involving the third. It all takes place at a lake house and it looks wonderful. The film wants to treat its subject as a comedy first and then a drama, and I thought it needed to be the other way around.", "label": 0 }, "truncated_cells": [] } ], "num_rows_total": 624, "num_rows_per_page": 100, "partial": false } }, "Search on an image dataset": { "summary": "The first 3 rows that match the 'bird' search query (query=bird&length=3). 
Images are included.", "description": "Try with https://datasets-server.huggingface.co/search?dataset=lambdalabs/pokemon-blip-captions&config=default&split=train&query=bird&length=3.", "value": { "features": [ { "feature_idx": 0, "name": "image", "type": { "_type": "Image" } }, { "feature_idx": 1, "name": "text", "type": { "dtype": "string", "_type": "Value" } } ], "rows": [ { "row_idx": 661, "row": { "image": { "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/8b762e1dac1b31d60e01ee8f08a9d8a232b59e17/--/default/train/661/image/image.jpg?Expires=1710431284&Signature=Nqv3yF0ZwDk8fDY~zXtIDfm0e4y3GnxkcovxAE17TJZUeUzFZaSJ2XC6KXwVQU7oEhZ2hGw6SGbkqWPLLWqcMIptJFlfWWGIy4Vi8KdONzd-a3NiZPzViSmpu7DtbosRuSDSpWh6VU0tmkGE2mQAdDPAsVEo583A4FL2pAJokcMcTWEGM6b6brPyOv3Lw2aOee9bbrGzKPlYOw7oqBvA4KzozS~1k9h~BlFXFSk5tS9fYNdDHWfO33qC-dApDPWhnpJjgo0ohtoFX0qeh5os291V96IwaZOwGiRacVB4YDaddCUYyIvFBD54wO64HJwG0UNK-AT8duBl5bEpK41vkA__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 1280, "width": 1280 }, "text": "a cartoon bird with a re" }, "truncated_cells": [] }, { "row_idx": 671, "row": { "image": { "src": "https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/8b762e1dac1b31d60e01ee8f08a9d8a232b59e17/--/default/train/671/image/image.jpg?Expires=1710431284&Signature=trBC9GyNbPK8TU9llw7UhhNotJXwoTdzWFeuFU9yehtaVs5SNGqdwVz32tLmFeSrFFjrljEKcB9ulU9LmzcHfOjmuicoa91HTZ~aUoWOaIhwNH3~xt7cx8t7qWddn6JOXk1B1k-z540sjFPmfb1fpRxyO-fDBmrDKpQJenhr6vE63H5etVS6daengIGD0LQc9VdyTOrSa2FhBCyyjJSLAm5G0RgCgN3IxFEteBuUyW-CTXGeYvFmzxMJT8nDkPm~d0tnXnhpkcO7oVB5gzOmiwNQnlQVjWdQI5fRsouOUZvcocfXhfrJylmMewkJyj4nusq1Q9bTOUlDbur-dJ6CoQ__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 431, "width": 431 }, "text": "a drawing of a flower with a bird on it" }, "truncated_cells": [] }, { "row_idx": 706, "row": { "image": { "src": 
"https://datasets-server.huggingface.co/cached-assets/lambdalabs/pokemon-blip-captions/--/8b762e1dac1b31d60e01ee8f08a9d8a232b59e17/--/default/train/706/image/image.jpg?Expires=1710431284&Signature=t~Bdg2T0E3CFBq9SZv~q4FRbWB4PAQS1qKdNsw11X9Q2PBu9z2KQ9ksSB-Mq~79ZixKBwWP7vrIG3~r~OfdLJb18jALa1~m2ADCTnNLIFAl6pYendnvwwzq~cPlOWYy28DX3QCoMQfLvRL20aVCV9Ry8CbFKr-EL0MCd~c~s6QWSYoF7GKkeA6DnmWs7~19BpMbZqX7bvhxePenncPXDQlmW-aZ-4BcvFPOWccRSpjw38vLtYCY6aKfeOM7T8-x2-gdgFvazKOPLFIjrZ5S883Y7NaM3nQc-trUKX2hm0Ij4~wBEaWzqVNwommhXSvbAIS0im15K22AxGTzlvgH51w__&Key-Pair-Id=K3EI6M078Z3AC3", "height": 1137, "width": 1137 }, "text": "a bird with an arrow in its beak" }, "truncated_cells": [] } ], "num_rows_total": 93, "num_rows_per_page": 100, "partial": false } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "$ref": "#/components/responses/DatasetConfigSplit404" }, "422": { "description": "Some of the 'dataset', 'config', 'split', 'query', 'offset' or 'length' parameters have not been provided or are invalid.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-422" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "missing-dataset": { "summary": "The dataset parameter is missing.", "value": { "error": "Parameter 'dataset' is required" } }, "missing-config": { "summary": "The config parameter is missing.", "value": { "error": "Parameter 'config' is required" } }, "missing-split": { "summary": "The split parameter is missing.", "value": { "error": "Parameter 'split' is required" } }, "missing-query": { "summary": "The query parameter is missing.", "value": { "error": "Parameter 'query' is required" } }, "empty-dataset": { "summary": "The dataset parameter is empty.", "value": { "error": "Parameter 'dataset' is required" } }, 
"empty-config": { "summary": "The config parameter is empty.", "value": { "error": "Parameter 'config' is required" } }, "empty-split": { "summary": "The split parameter is empty.", "value": { "error": "Parameter 'split' is required" } }, "empty-query": { "summary": "The query parameter is empty.", "value": { "error": "Parameter 'query' is required" } }, "non-integer-offset": { "summary": "The offset must be an integer.", "value": { "error": "Parameter 'offset' must be integer" } }, "negative-offset": { "summary": "The offset must not be negative.", "value": { "error": "Parameter 'offset' must be positive" } }, "non-integer-length": { "summary": "The length must be an integer.", "value": { "error": "Parameter 'length' must be integer" } }, "negative-length": { "summary": "The length must not be negative.", "value": { "error": "Parameter 'length' must be positive" } }, "too-large-length": { "summary": "The length must not be too large.", "value": { "error": "Parameter 'length' must not be greater than 100" } } } } } }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. 
If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500-search" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response from being created.", "description": "Try with https://datasets-server.huggingface.co/search?dataset=allenai/atomic&config=atomic&split=train&query=dog", "value": { "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. 
inside a data/ folder).", "cause_exception": "HTTPError", "cause_message": "404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz", "cause_traceback": [ "Traceback (most recent call last):\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 506, in raise_if_too_big_from_external_data_files\n for i, size in enumerate(pool.imap_unordered(get_size, ext_data_files)):\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 870, in next\n raise value\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 402, in _request_size\n response.raise_for_status()\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/requests/models.py\", line 1021, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\n", "requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\n" ] } }, "response not ready": { "$ref": "#/components/examples/ResponseNotReadyError" }, "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } }, "501": { "description": "The server does not implement the feature.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-501" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "blocked dataset": { "summary": "The dataset is blocked manually on the server.", "description": "Try with 
https://datasets-server.huggingface.co/search?dataset=echarlaix/vqa-lxmert&config=vqa&split=validation&query=test", "value": { "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/dataset-viewer if you want this dataset to be supported." } } } } } } } } }, "/filter": { "get": { "summary": "Filter rows of a split", "description": "Returns the rows matching the filter query, ordered by row index. Up to 100 rows are returned. The offset and length parameters allow navigating the results.", "externalDocs": { "description": "See filter (Hub docs).", "url": "https://huggingface.co/docs/dataset-viewer/filter" }, "operationId": "filterRows", "security": [ {}, { "AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" }, { "$ref": "#/components/parameters/RequiredConfig" }, { "$ref": "#/components/parameters/RequiredSplit" }, { "name": "where", "in": "query", "description": "The filter query.", "required": true, "schema": { "type": "string" }, "examples": { "Age = 30": { "summary": "filter the rows where the 'Age' column equals 30", "value": "Age = 30" } } }, { "name": "offset", "in": "query", "description": "The offset of the returned rows.", "schema": { "type": "integer", "default": 0, "minimum": 0 }, "examples": { "0": { "summary": "from the beginning", "value": 0 }, "100": { "summary": "ignore the first 100 results", "value": 100 } } }, { "name": "length", "in": "query", "description": "The maximum number of returned rows.", "schema": { "type": "integer", "default": 100, "minimum": 0, "maximum": 100 }, "examples": { "100": { "summary": "up to 100 rows in the response", "value": 100 } } } ], "responses": { "200": { "description": "The features, and the list of rows that match the filter query. 
Bytes columns are not supported at the moment, and their content will be 'null'.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/PaginatedResponse" }, "examples": { "Filter on float equality (julien-c/titanic-survival)": { "summary": "The first 2 rows where the float 'Age' column is equal to 30 (where=Age=30&length=2).", "description": "Try with: https://datasets-server.huggingface.co/filter?dataset=julien-c/titanic-survival&config=default&split=train&where=Age=30&length=2", "value": { "features": [ { "feature_idx": 0, "name": "Survived", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 1, "name": "Pclass", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 2, "name": "Name", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 3, "name": "Sex", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 4, "name": "Age", "type": { "dtype": "float64", "_type": "Value" } }, { "feature_idx": 5, "name": "Siblings/Spouses Aboard", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 6, "name": "Parents/Children Aboard", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 7, "name": "Fare", "type": { "dtype": "float64", "_type": "Value" } } ], "rows": [ { "row_idx": 44, "row": { "Survived": 0, "Pclass": 3, "Name": "Mr. William John Rogers", "Sex": "male", "Age": 30.0, "Siblings/Spouses Aboard": 0, "Parents/Children Aboard": 0, "Fare": 8.05 }, "truncated_cells": [] }, { "row_idx": 78, "row": { "Survived": 1, "Pclass": 3, "Name": "Miss. 
Elizabeth Dowdell", "Sex": "female", "Age": 30.0, "Siblings/Spouses Aboard": 0, "Parents/Children Aboard": 0, "Fare": 12.475 }, "truncated_cells": [] } ], "num_rows_total": 33, "num_rows_per_page": 100, "partial": false } }, "Filter on string equality (julien-c/titanic-survival)": { "summary": "The first 2 rows where the string 'Sex' column is equal to 'female' (where=Sex='female'&length=2).", "description": "Try with: https://datasets-server.huggingface.co/filter?dataset=julien-c/titanic-survival&config=default&split=train&where=Sex='female'&length=2", "value": { "features": [ { "feature_idx": 0, "name": "Survived", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 1, "name": "Pclass", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 2, "name": "Name", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 3, "name": "Sex", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 4, "name": "Age", "type": { "dtype": "float64", "_type": "Value" } }, { "feature_idx": 5, "name": "Siblings/Spouses Aboard", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 6, "name": "Parents/Children Aboard", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 7, "name": "Fare", "type": { "dtype": "float64", "_type": "Value" } } ], "rows": [ { "row_idx": 1, "row": { "Survived": 1, "Pclass": 1, "Name": "Mrs. John Bradley (Florence Briggs Thayer) Cumings", "Sex": "female", "Age": 38.0, "Siblings/Spouses Aboard": 1, "Parents/Children Aboard": 0, "Fare": 71.2833 }, "truncated_cells": [] }, { "row_idx": 2, "row": { "Survived": 1, "Pclass": 3, "Name": "Miss. 
Laina Heikkinen", "Sex": "female", "Age": 26.0, "Siblings/Spouses Aboard": 0, "Parents/Children Aboard": 0, "Fare": 7.925 }, "truncated_cells": [] } ], "num_rows_total": 314, "num_rows_per_page": 100, "partial": false } }, "Filter on float inequality (julien-c/titanic-survival)": { "summary": "The first 2 rows where the float 'Fare' column is larger than 50 (where=Fare>50&length=2).", "description": "Try with: https://datasets-server.huggingface.co/filter?dataset=julien-c/titanic-survival&config=default&split=train&where=Fare>50&length=2", "value": { "features": [ { "feature_idx": 0, "name": "Survived", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 1, "name": "Pclass", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 2, "name": "Name", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 3, "name": "Sex", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 4, "name": "Age", "type": { "dtype": "float64", "_type": "Value" } }, { "feature_idx": 5, "name": "Siblings/Spouses Aboard", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 6, "name": "Parents/Children Aboard", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 7, "name": "Fare", "type": { "dtype": "float64", "_type": "Value" } } ], "rows": [ { "row_idx": 1, "row": { "Survived": 1, "Pclass": 1, "Name": "Mrs. John Bradley (Florence Briggs Thayer) Cumings", "Sex": "female", "Age": 38.0, "Siblings/Spouses Aboard": 1, "Parents/Children Aboard": 0, "Fare": 71.2833 }, "truncated_cells": [] }, { "row_idx": 3, "row": { "Survived": 1, "Pclass": 1, "Name": "Mrs. 
Jacques Heath (Lily May Peel) Futrelle", "Sex": "female", "Age": 35.0, "Siblings/Spouses Aboard": 1, "Parents/Children Aboard": 0, "Fare": 53.1 }, "truncated_cells": [] } ], "num_rows_total": 160, "num_rows_per_page": 100, "partial": false } }, "Filter on logical AND (julien-c/titanic-survival)": { "summary": "The first 2 rows where the integer 'Pclass' column is equal to 2 and the integer 'Siblings/Spouses Aboard' column is larger than 0 (where=Pclass=2 AND \"Siblings/Spouses Aboard\">0&length=2).", "description": "Try with: https://datasets-server.huggingface.co/filter?dataset=julien-c/titanic-survival&config=default&split=train&where=Pclass=2 AND \"Siblings/Spouses Aboard\">0&length=2", "value": { "features": [ { "feature_idx": 0, "name": "Survived", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 1, "name": "Pclass", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 2, "name": "Name", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 3, "name": "Sex", "type": { "dtype": "string", "_type": "Value" } }, { "feature_idx": 4, "name": "Age", "type": { "dtype": "float64", "_type": "Value" } }, { "feature_idx": 5, "name": "Siblings/Spouses Aboard", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 6, "name": "Parents/Children Aboard", "type": { "dtype": "int64", "_type": "Value" } }, { "feature_idx": 7, "name": "Fare", "type": { "dtype": "float64", "_type": "Value" } } ], "rows": [ { "row_idx": 9, "row": { "Survived": 1, "Pclass": 2, "Name": "Mrs. Nicholas (Adele Achem) Nasser", "Sex": "female", "Age": 14.0, "Siblings/Spouses Aboard": 1, "Parents/Children Aboard": 0, "Fare": 30.0708 }, "truncated_cells": [] }, { "row_idx": 41, "row": { "Survived": 0, "Pclass": 2, "Name": "Mrs. 
William John Robert (Dorothy Ann Wonnacott) Turpin", "Sex": "female", "Age": 27.0, "Siblings/Spouses Aboard": 1, "Parents/Children Aboard": 0, "Fare": 21.0 }, "truncated_cells": [] } ], "num_rows_total": 64, "num_rows_per_page": 100, "partial": false } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "$ref": "#/components/responses/DatasetConfigSplit404" }, "422": { "description": "Some of the 'dataset', 'config', 'split', 'where', 'offset' or 'length' parameters have not been provided or are invalid.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-422" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "missing-dataset": { "summary": "The dataset parameter is missing.", "value": { "error": "Parameter 'dataset' is required" } }, "missing-config": { "summary": "The config parameter is missing.", "value": { "error": "Parameter 'config' is required" } }, "missing-split": { "summary": "The split parameter is missing.", "value": { "error": "Parameter 'split' is required" } }, "empty-dataset": { "summary": "The dataset parameter is empty.", "value": { "error": "Parameter 'dataset' is required" } }, "empty-config": { "summary": "The config parameter is empty.", "value": { "error": "Parameter 'config' is required" } }, "empty-split": { "summary": "The split parameter is empty.", "value": { "error": "Parameter 'split' is required" } }, "non-integer-offset": { "summary": "The offset must be integer.", "value": { "error": "Parameter 'offset' must be integer" } }, "negative-offset": { "summary": "The offset must be positive.", "value": { "error": "Parameter 'offset' must be positive" } }, "non-integer-length": { "summary": "The length must be integer.", "value": { "error": "Parameter 'length' must be 
integer" } }, "negative-length": { "summary": "The length must be positive.", "value": { "error": "Parameter 'length' must be positive" } }, "too-large-length": { "summary": "The length must not be too large.", "value": { "error": "Parameter 'length' must not be greater than 100" } }, "where-with-invalid-symbols": { "summary": "The where parameter contains invalid symbols.", "value": { "error": "Parameter 'where' contains invalid symbols" } }, "orderby-with-invalid-symbols": { "summary": "The orderby parameter contains invalid symbols.", "value": { "error": "Parameter 'orderby' contains invalid symbols" } }, "invalid-parameter": { "summary": "A query parameter is invalid.", "value": { "error": "A query parameter is invalid" } } } } } }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. 
If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500-search" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", "description": "Try with https://datasets-server.huggingface.co/filter?dataset=allenai/atomic&config=atomic&split=train&where=event='a'", "value": { "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. 
inside a data/ folder).", "cause_exception": "HTTPError", "cause_message": "404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz", "cause_traceback": [ "Traceback (most recent call last):\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 506, in raise_if_too_big_from_external_data_files\n for i, size in enumerate(pool.imap_unordered(get_size, ext_data_files)):\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 870, in next\n raise value\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 402, in _request_size\n response.raise_for_status()\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/requests/models.py\", line 1021, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\n", "requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\n" ] } }, "response not ready": { "$ref": "#/components/examples/ResponseNotReadyError" }, "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } }, "501": { "description": "The server does not implement the feature.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-501" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "blocked dataset": { "summary": "The dataset is blocked manually on the server.", "description": "Try with 
https://datasets-server.huggingface.co/filter?dataset=echarlaix/vqa-lxmert&config=vqa&split=validation&where=question_id=4", "value": { "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/dataset-viewer if you want this dataset to be supported." } } } } } } } } }, "/parquet": { "get": { "summary": "List of parquet files", "description": "The dataset is converted to the parquet format. The endpoint gives the list of the parquet files. The same split can have multiple parquet files, called shards. They are sorted by their shard index.", "externalDocs": { "description": "See Parquet (Hub docs)", "url": "https://huggingface.co/docs/dataset-viewer/parquet" }, "operationId": "listParquetFiles", "security": [ {}, { "AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" }, { "$ref": "#/components/parameters/OptionalConfig" } ], "responses": { "200": { "description": "A list of parquet files.
Beware: the response is not paginated.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ParquetResponse" }, "examples": { "ibm/duorc": { "summary": "ibm/duorc: six parquet files, one per split", "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=ibm/duorc", "value": { "parquet_files": [ { "dataset": "ibm/duorc", "config": "ParaphraseRC", "split": "test", "url": "https://huggingface.co/datasets/ibm/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/test/0000.parquet", "filename": "duorc-test.parquet", "size": 6136591 }, { "dataset": "ibm/duorc", "config": "ParaphraseRC", "split": "train", "url": "https://huggingface.co/datasets/ibm/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/train/0000.parquet", "filename": "duorc-train.parquet", "size": 26005668 }, { "dataset": "ibm/duorc", "config": "ParaphraseRC", "split": "validation", "url": "https://huggingface.co/datasets/ibm/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/validation/0000.parquet", "filename": "duorc-validation.parquet", "size": 5566868 }, { "dataset": "ibm/duorc", "config": "SelfRC", "split": "test", "url": "https://huggingface.co/datasets/ibm/duorc/resolve/refs%2Fconvert%2Fparquet/SelfRC/test/0000.parquet", "filename": "duorc-test.parquet", "size": 3035736 }, { "dataset": "ibm/duorc", "config": "SelfRC", "split": "train", "url": "https://huggingface.co/datasets/ibm/duorc/resolve/refs%2Fconvert%2Fparquet/SelfRC/train/0000.parquet", "filename": "duorc-train.parquet", "size": 14851720 }, { "dataset": "ibm/duorc", "config": "SelfRC", "split": "validation", "url": "https://huggingface.co/datasets/ibm/duorc/resolve/refs%2Fconvert%2Fparquet/SelfRC/validation/0000.parquet", "filename": "duorc-validation.parquet", "size": 3114390 } ], "pending": [], "failed": [], 
"partial": false } }, "duorc with ParaphraseRC config": { "summary": "duorc: three parquet files for ParaphraseRC, one per split", "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=ibm/duorc&config=ParaphraseRC", "value": { "parquet_files": [ { "dataset": "ibm/duorc", "config": "ParaphraseRC", "split": "test", "url": "https://huggingface.co/datasets/ibm/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/test/0000.parquet", "filename": "duorc-test.parquet", "size": 6136591 }, { "dataset": "ibm/duorc", "config": "ParaphraseRC", "split": "train", "url": "https://huggingface.co/datasets/ibm/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/train/0000.parquet", "filename": "duorc-train.parquet", "size": 26005668 }, { "dataset": "ibm/duorc", "config": "ParaphraseRC", "split": "validation", "url": "https://huggingface.co/datasets/ibm/duorc/resolve/refs%2Fconvert%2Fparquet/ParaphraseRC/validation/0000.parquet", "filename": "duorc-validation.parquet", "size": 5566868 } ], "features": { "plot_id": { "dtype": "string", "_type": "Value" }, "plot": { "dtype": "string", "_type": "Value" }, "title": { "dtype": "string", "_type": "Value" }, "question_id": { "dtype": "string", "_type": "Value" }, "question": { "dtype": "string", "_type": "Value" }, "answers": { "feature": { "dtype": "string", "_type": "Value" }, "_type": "List" }, "no_answer": { "dtype": "bool", "_type": "Value" } }, "partial": false } }, "sharded parquet files": { "summary": "alexandrainst/da-wit: the parquet file for the train split is partitioned into 17 shards", "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=alexandrainst/da-wit", "value": { "parquet_files": [ { "dataset": "alexandrainst/da-wit", "config": "default", "split": "test", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/test/0000.parquet", "filename": "parquet-test.parquet", "size": 48684227 }, { "dataset": "alexandrainst/da-wit", 
"config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet", "filename": "parquet-train-00000-of-00017.parquet", "size": 465549291 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0001.parquet", "filename": "parquet-train-00001-of-00017.parquet", "size": 465701535 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0002.parquet", "filename": "parquet-train-00002-of-00017.parquet", "size": 463857123 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0003.parquet", "filename": "parquet-train-00003-of-00017.parquet", "size": 456197486 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0004.parquet", "filename": "parquet-train-00004-of-00017.parquet", "size": 465412051 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0005.parquet", "filename": "parquet-train-00005-of-00017.parquet", "size": 469114305 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0006.parquet", "filename": "parquet-train-00006-of-00017.parquet", "size": 460338645 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": 
"https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0007.parquet", "filename": "parquet-train-00007-of-00017.parquet", "size": 468309376 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0008.parquet", "filename": "parquet-train-00008-of-00017.parquet", "size": 490063121 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0009.parquet", "filename": "parquet-train-00009-of-00017.parquet", "size": 460462764 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0010.parquet", "filename": "parquet-train-00010-of-00017.parquet", "size": 476525998 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0011.parquet", "filename": "parquet-train-00011-of-00017.parquet", "size": 470327354 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0012.parquet", "filename": "parquet-train-00012-of-00017.parquet", "size": 457138334 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0013.parquet", "filename": "parquet-train-00013-of-00017.parquet", "size": 464485292 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": 
"https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0014.parquet", "filename": "parquet-train-00014-of-00017.parquet", "size": 466549376 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0015.parquet", "filename": "parquet-train-00015-of-00017.parquet", "size": 460452174 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "train", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/train/0016.parquet", "filename": "parquet-train-00016-of-00017.parquet", "size": 480583533 }, { "dataset": "alexandrainst/da-wit", "config": "default", "split": "val", "url": "https://huggingface.co/datasets/alexandrainst/da-wit/resolve/refs%2Fconvert%2Fparquet/default/val/0000.parquet", "filename": "parquet-val.parquet", "size": 11434278 } ], "pending": [], "failed": [], "partial": false } }, "partial parquet export": { "summary": "allenai/c4 (en): the parquet export is partial (first 5GB)", "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=allenai/c4&config=en", "value": { "parquet_files": [ { "dataset": "allenai/c4", "config": "en", "split": "train", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0000.parquet", "filename": "0000.parquet", "size": 312302655 }, { "dataset": "allenai/c4", "config": "en", "split": "train", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0001.parquet", "filename": "0001.parquet", "size": 314250060 }, { "dataset": "allenai/c4", "config": "en", "split": "train", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0002.parquet", "filename": "0002.parquet", "size": 312268050 }, { "dataset": "allenai/c4", "config": "en", "split": 
"train", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0003.parquet", "filename": "0003.parquet", "size": 312065965 }, { "dataset": "allenai/c4", "config": "en", "split": "train", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0004.parquet", "filename": "0004.parquet", "size": 308599130 }, { "dataset": "allenai/c4", "config": "en", "split": "train", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0005.parquet", "filename": "0005.parquet", "size": 312308752 }, { "dataset": "allenai/c4", "config": "en", "split": "train", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0006.parquet", "filename": "0006.parquet", "size": 313118966 }, { "dataset": "allenai/c4", "config": "en", "split": "train", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0007.parquet", "filename": "0007.parquet", "size": 313275039 }, { "dataset": "allenai/c4", "config": "en", "split": "train", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0008.parquet", "filename": "0008.parquet", "size": 312402829 }, { "dataset": "allenai/c4", "config": "en", "split": "train", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-train/0009.parquet", "filename": "0009.parquet", "size": 273854946 }, { "dataset": "allenai/c4", "config": "en", "split": "validation", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-validation/0000.parquet", "filename": "0000.parquet", "size": 311994499 }, { "dataset": "allenai/c4", "config": "en", "split": "validation", "url": "https://huggingface.co/datasets/allenai/c4/resolve/refs%2Fconvert%2Fparquet/en/partial-validation/0001.parquet", "filename": "0001.parquet", "size": 197281279 
} ], "features": { "text": { "dtype": "string", "_type": "Value" }, "timestamp": { "dtype": "string", "_type": "Value" }, "url": { "dtype": "string", "_type": "Value" } }, "partial": true } }, "dataset where no parquet file could be created": { "summary": "When the parquet files cannot be created for a configuration, it's listed in 'failed'.", "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=allenai/atomic", "value": { "parquet_files": [], "pending": [], "failed": [ { "kind": "config-info", "dataset": "allenai/atomic", "config": "atomic", "split": null } ], "partial": false } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "$ref": "#/components/responses/DatasetConfig404" }, "422": { "$ref": "#/components/responses/Dataset422" }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. 
If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500-common" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=allenai/atomic&config=atomic", "value": { "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. inside a data/ folder).", "cause_exception": "HTTPError", "cause_message": "404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz", "cause_traceback": [ "Traceback (most recent call last):\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 506, in raise_if_too_big_from_external_data_files\n for i, size in enumerate(pool.imap_unordered(get_size, ext_data_files)):\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 870, in next\n raise value\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 402, in _request_size\n response.raise_for_status()\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/requests/models.py\", line 1021, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\n", 
"requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\n" ] } }, "response not ready": { "$ref": "#/components/examples/ResponseNotReadyError" }, "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } }, "501": { "description": "The server does not implement the feature.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-501" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "blocked dataset": { "summary": "The dataset is blocked manually on the server.", "description": "Try with https://datasets-server.huggingface.co/parquet?dataset=echarlaix/vqa-lxmert&config=vqa", "value": { "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/dataset-viewer if you want this dataset to be supported." } } } } } } } } }, "/is-valid": { "get": { "summary": "Check if a dataset is valid", "description": "Returns the capabilities of the dataset: show a preview of the 100 first rows, show the viewer for all the rows, search/filter the rows, have statistics. 
Use the optional config and split parameters to filter the response.", "externalDocs": { "description": "See Valid datasets (Hub docs)", "url": "https://huggingface.co/docs/dataset-viewer/valid" }, "operationId": "isValidDataset", "security": [ {}, { "AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" }, { "$ref": "#/components/parameters/OptionalConfig" }, { "$ref": "#/components/parameters/OptionalSplit" } ], "responses": { "200": { "description": "The capabilities of the dataset.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/IsValidResponse" }, "examples": { "all the capabilities": { "summary": "valid dataset", "description": "Try with https://datasets-server.huggingface.co/is-valid?dataset=nyu-mll/glue", "value": { "preview": true, "viewer": true, "search": true, "filter": true, "statistics": true } }, "only preview": { "summary": "dataset with only preview", "description": "Try with https://datasets-server.huggingface.co/is-valid?dataset=ehartford/dolphin", "value": { "preview": true, "viewer": false, "search": false, "filter": false, "statistics": false } }, "no capabilities": { "summary": "dataset with no capabilities", "description": "Try with https://datasets-server.huggingface.co/is-valid?dataset=allenai/atomic", "value": { "preview": false, "viewer": false, "search": false, "filter": false, "statistics": false } }, "all the capabilities, for a config": { "summary": "valid config", "description": "Try with https://datasets-server.huggingface.co/is-valid?dataset=nyu-mll/glue&config=ax", "value": { "preview": true, "viewer": true, "search": true, "filter": true, "statistics": true } }, "all the capabilities, for a split": { "summary": "valid split", 
"description": "Try with https://datasets-server.huggingface.co/is-valid?dataset=nyu-mll/glue&config=ax&split=test", "value": { "preview": true, "viewer": true, "search": true, "filter": true, "statistics": true } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "$ref": "#/components/responses/Dataset404" }, "422": { "$ref": "#/components/responses/Dataset422" }, "500": { "description": "The server crashed.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500-is-valid" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } } } } }, "/info": { "get": { "summary": "Get the metadata of a dataset.", "description": "Returns the metadata of the dataset: description, homepage, features, etc. 
Use the optional config parameter to filter the response on a subset.", "externalDocs": { "description": "The response is a dump of the DatasetInfo object from the datasets library", "url": "https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetInfo" }, "operationId": "getInfo", "security": [ {}, { "AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" }, { "$ref": "#/components/parameters/OptionalConfig" } ], "responses": { "200": { "description": "The metadata of the dataset.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/InfoResponse" }, "examples": { "dataset metadata": { "summary": "metadata of a dataset. It's an object, with one key per subset", "description": "Try with https://datasets-server.huggingface.co/info?dataset=ylecun/mnist", "value": { "dataset_info": { "ylecun/mnist": { "description": "The MNIST dataset consists of 70,000 28x28 black-and-white images in 10 classes (one for each digits), with 7,000\nimages per class. There are 60,000 training images and 10,000 test images.\n", "citation": "@article{lecun2010mnist,\n title={MNIST handwritten digit database},\n author={LeCun, Yann and Cortes, Corinna and Burges, CJ},\n journal={ATT Labs [Online]. 
Available: http://yann.lecun.com/exdb/mnist},\n volume={2},\n year={2010}\n}\n", "homepage": "http://yann.lecun.com/exdb/mnist/", "license": "", "features": { "image": { "_type": "Image" }, "label": { "names": [ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9" ], "_type": "ClassLabel" } }, "supervised_keys": { "input": "image", "output": "label" }, "task_templates": [ { "task": "image-classification", "label_column": "label" } ], "builder_name": "ylecun/mnist", "config_name": "mnist", "version": { "version_str": "1.0.0", "major": 1, "minor": 0, "patch": 0 }, "splits": { "train": { "name": "train", "num_bytes": 17471100, "num_examples": 60000, "dataset_name": "ylecun/mnist" }, "test": { "name": "test", "num_bytes": 2916482, "num_examples": 10000, "dataset_name": "ylecun/mnist" } }, "download_checksums": { "https://storage.googleapis.com/cvdf-datasets/mnist/train-images-idx3-ubyte.gz": { "num_bytes": 9912422, "checksum": null }, "https://storage.googleapis.com/cvdf-datasets/mnist/train-labels-idx1-ubyte.gz": { "num_bytes": 28881, "checksum": null }, "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-images-idx3-ubyte.gz": { "num_bytes": 1648877, "checksum": null }, "https://storage.googleapis.com/cvdf-datasets/mnist/t10k-labels-idx1-ubyte.gz": { "num_bytes": 4542, "checksum": null } }, "download_size": 11594722, "dataset_size": 20387582, "size_in_bytes": 31982304 } }, "pending": [], "failed": [], "partial": false } }, "config metadata": { "summary": "metadata for a dataset subset", "description": "Try with https://datasets-server.huggingface.co/info?dataset=nyu-mll/glue&config=ax", "value": { "dataset_info": { "description": "", "citation": "", "homepage": "", "license": "", "features": { "premise": { "dtype": "string", "_type": "Value" }, "hypothesis": { "dtype": "string", "_type": "Value" }, "label": { "names": ["entailment", "neutral", "contradiction"], "_type": "ClassLabel" }, "idx": { "dtype": "int32", "_type": "Value" } }, "builder_name": "parquet", 
"dataset_name": "glue", "config_name": "ax", "version": { "version_str": "0.0.0", "major": 0, "minor": 0, "patch": 0 }, "splits": { "test": { "name": "test", "num_bytes": 243791, "num_examples": 1104, "dataset_name": null } }, "download_size": 80767, "dataset_size": 243791 }, "partial": false } }, "dataset metadata with failed subsets": { "summary": "metadata of a dataset which has failed subsets. The failed subsets are listed in 'failed'.", "description": "Try with https://datasets-server.huggingface.co/info?dataset=allenai/atomic", "value": { "dataset_info": {}, "pending": [], "failed": [ { "kind": "config-info", "dataset": "allenai/atomic", "config": "atomic", "split": null } ], "partial": false } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "$ref": "#/components/responses/DatasetConfig404" }, "422": { "$ref": "#/components/responses/Dataset422" }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. 
If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500-common" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", "description": "Try with https://datasets-server.huggingface.co/info?dataset=allenai/atomic&config=atomic", "value": { "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. inside a data/ folder).", "cause_exception": "HTTPError", "cause_message": "404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz", "cause_traceback": [ "Traceback (most recent call last):\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 506, in raise_if_too_big_from_external_data_files\n for i, size in enumerate(pool.imap_unordered(get_size, ext_data_files)):\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 870, in next\n raise value\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 402, in _request_size\n response.raise_for_status()\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/requests/models.py\", line 1021, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\n", 
"requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\n" ] } }, "response not ready": { "$ref": "#/components/examples/ResponseNotReadyError" }, "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } }, "501": { "description": "The server does not implement the feature.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-501" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "blocked dataset": { "summary": "The dataset is blocked manually on the server.", "description": "Try with https://datasets-server.huggingface.co/info?dataset=echarlaix/vqa-lxmert&config=vqa", "value": { "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/dataset-viewer if you want this dataset to be supported." } } } } } } } } }, "/size": { "get": { "summary": "Get the size of a dataset.", "description": "Returns the size (number of rows, storage) of the dataset. 
Use the optional config parameter to filter the response on a subset.", "externalDocs": { "description": "See size in the Hub docs.", "url": "https://huggingface.co/docs/dataset-viewer/size" }, "operationId": "getSize", "security": [ {}, { "AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" }, { "$ref": "#/components/parameters/OptionalConfig" } ], "responses": { "200": { "description": "The size of the dataset.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/SizeResponse" }, "examples": { "dataset size": { "summary": "size of a dataset.", "description": "Try with https://datasets-server.huggingface.co/size?dataset=ylecun/mnist", "value": { "size": { "dataset": { "dataset": "ylecun/mnist", "num_bytes_original_files": 11594722, "num_bytes_parquet_files": 18157506, "num_bytes_memory": 20387582, "num_rows": 70000, "estimated_num_rows": null }, "configs": [ { "dataset": "ylecun/mnist", "config": "mnist", "num_bytes_original_files": 11594722, "num_bytes_parquet_files": 18157506, "num_bytes_memory": 20387582, "num_rows": 70000, "num_columns": 2, "estimated_num_rows": null } ], "splits": [ { "dataset": "ylecun/mnist", "config": "mnist", "split": "train", "num_bytes_parquet_files": 15561616, "num_bytes_memory": 17471100, "num_rows": 60000, "num_columns": 2, "estimated_num_rows": null }, { "dataset": "ylecun/mnist", "config": "mnist", "split": "test", "num_bytes_parquet_files": 2595890, "num_bytes_memory": 2916482, "num_rows": 10000, "num_columns": 2, "estimated_num_rows": null } ] }, "pending": [], "failed": [], "partial": false } }, "config size": { "summary": "size of a dataset subset", "description": "Try with 
https://datasets-server.huggingface.co/size?dataset=nyu-mll/glue&config=ax", "value": { "size": { "config": { "dataset": "nyu-mll/glue", "config": "ax", "num_bytes_original_files": 222257, "num_bytes_parquet_files": 80767, "num_bytes_memory": 237694, "num_rows": 1104, "num_columns": 4, "estimated_num_rows": null }, "splits": [ { "dataset": "nyu-mll/glue", "config": "ax", "split": "test", "num_bytes_parquet_files": 80767, "num_bytes_memory": 237694, "num_rows": 1104, "num_columns": 4, "estimated_num_rows": null } ] }, "partial": false } }, "dataset size with failed subsets": { "summary": "size of a dataset which has failed subsets. The failed subsets are listed in 'failed'.", "description": "Try with https://datasets-server.huggingface.co/size?dataset=allenai/atomic", "value": { "size": { "dataset": { "dataset": "allenai/atomic", "num_bytes_original_files": 0, "num_bytes_parquet_files": 0, "num_bytes_memory": 0, "num_rows": 0, "estimated_num_rows": null }, "configs": [], "splits": [] }, "pending": [], "failed": [ { "kind": "config-size", "dataset": "allenai/atomic", "config": "atomic", "split": null } ], "partial": false } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "$ref": "#/components/responses/DatasetConfig404" }, "422": { "$ref": "#/components/responses/Dataset422" }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. 
If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500-common" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response to be created.", "description": "Try with https://datasets-server.huggingface.co/size?dataset=allenai/atomic&config=atomic", "value": { "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. inside a data/ folder).", "cause_exception": "HTTPError", "cause_message": "404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz", "cause_traceback": [ "Traceback (most recent call last):\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 506, in raise_if_too_big_from_external_data_files\n for i, size in enumerate(pool.imap_unordered(get_size, ext_data_files)):\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 870, in next\n raise value\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 402, in _request_size\n response.raise_for_status()\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/requests/models.py\", line 1021, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\n", 
"requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\n" ] } }, "response not ready": { "$ref": "#/components/examples/ResponseNotReadyError" }, "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } }, "501": { "description": "The server does not implement the feature.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-501" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "blocked dataset": { "summary": "The dataset is blocked manually on the server.", "description": "Try with https://datasets-server.huggingface.co/size?dataset=echarlaix/vqa-lxmert&config=vqa", "value": { "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/dataset-viewer if you want this dataset to be supported." } } } } } } } } }, "/opt-in-out-urls": { "get": { "summary": "Get the number of opted-in and opted-out image URLs in a dataset.", "description": "Based on the API of spawning.ai, returns the number of image URLs that have been opted-in and opted-out. Use the optional config and split parameters to filter the response. Only a sample of the rows is scanned, the first 100K rows at the moment.", "externalDocs": { "description": "See spawning.ai (Hub docs). 
The doc is still missing for the endpoint, see https://github.com/huggingface/dataset-viewer/issues/1664.", "url": "https://huggingface.co/docs/dataset-viewer/" }, "operationId": "getOptInOutUrls", "security": [ {}, { "AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" }, { "$ref": "#/components/parameters/OptionalConfig" }, { "$ref": "#/components/parameters/OptionalSplit" } ], "responses": { "200": { "description": "The number of opted-in and opted-out image URLs in the dataset.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/OptInOutUrlsCountResponse" }, "examples": { "number of URLs for a dataset": { "summary": "number of URLs for a dataset.", "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=google-research-datasets/conceptual_captions", "value": { "urls_columns": ["image_url"], "has_urls_columns": true, "num_opt_in_urls": 0, "num_opt_out_urls": 54760, "num_scanned_rows": 215840, "num_urls": 215840, "full_scan": false } }, "number of URLs for a subset": { "summary": "number of URLs for a subset.", "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=google-research-datasets/conceptual_captions&config=labeled", "value": { "urls_columns": ["image_url"], "has_urls_columns": true, "num_opt_in_urls": 0, "num_opt_out_urls": 16579, "num_scanned_rows": 100000, "num_urls": 100000, "full_scan": false } }, "number of URLs for a split": { "summary": "number of URLs for a split.", "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=google-research-datasets/conceptual_captions&config=labeled&split=train", "value": { "has_urls_columns": true, "num_opt_in_urls": 0, 
"num_opt_out_urls": 16579, "num_scanned_rows": 100000, "num_urls": 100000, "urls_columns": ["image_url"], "full_scan": false } }, "dataset that has no image URLs columns": { "summary": "no image URLs columns: values are zero.", "description": "Try with https://datasets-server.huggingface.co/opt-in-out-urls?dataset=ylecun/mnist", "value": { "urls_columns": [], "has_urls_columns": false, "num_opt_in_urls": 0, "num_opt_out_urls": 0, "num_scanned_rows": 0, "num_urls": 0, "full_scan": false } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "$ref": "#/components/responses/DatasetConfigSplit404" }, "422": { "$ref": "#/components/responses/Dataset422" }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "response not ready": { "$ref": "#/components/examples/ResponseNotReadyError" }, "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } }, "501": { "description": "The server does not implement the feature.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { 
"$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-501" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": {} } } } } } }, "/presidio-entities": { "get": { "summary": "Get the number of rows containing Presidio entities in a dataset.", "description": "Based on Presidio, returns the number of rows containing names, emails, phone numbers or sensitive PII. Only a sample of the rows is scanned, the first 10K rows at the moment.", "externalDocs": { "description": "See https://microsoft.github.io/presidio/. The Hub docs are still missing for the endpoint, see https://github.com/huggingface/dataset-viewer/issues/1664.", "url": "https://huggingface.co/docs/dataset-viewer/" }, "operationId": "getPresidioEntities", "security": [ {}, { "AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" } ], "responses": { "200": { "description": "The number of Presidio entities in the dataset.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/PresidioEntitiesCountResponse" }, "examples": { "number of entities for a dataset": { "summary": "number of entities for a dataset.", "description": "Try with https://datasets-server.huggingface.co/presidio-entities?dataset=lhoestq/fake_name_and_ssn", "value": { "scanned_columns": ["fake_name", "fake_ssn"], "num_rows_with_person_entities": 3, "num_rows_with_phone_number_entities": 0, "num_rows_with_email_address_entities": 0, "num_rows_with_sensitive_pii": 2, "num_scanned_rows": 3, "has_scanned_columns": true, "full_scan": true } }, "dataset that has no scanned columns": { "summary": "no scanned columns: values are zero.", 
"description": "Try with https://datasets-server.huggingface.co/presidio-entities?dataset=ylecun/mnist", "value": { "scanned_columns": [], "num_rows_with_person_entities": 0, "num_rows_with_phone_number_entities": 0, "num_rows_with_email_address_entities": 0, "num_rows_with_sensitive_pii": 0, "num_scanned_rows": 0, "has_scanned_columns": false, "full_scan": false } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "$ref": "#/components/responses/DatasetConfigSplit404" }, "422": { "$ref": "#/components/responses/Dataset422" }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "response not ready": { "$ref": "#/components/examples/ResponseNotReadyError" }, "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } }, "501": { "description": "The server does not implement the feature or Presidio is not enabled on this dataset.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { 
"$ref": "#/components/headers/X-Error-Code-501" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": {} } } } } } }, "/statistics": { "get": { "summary": "Descriptive statistics of a split's columns", "description": "Returns descriptive statistics, such as min, max, average, histogram, of the columns of a split.", "externalDocs": { "description": "See statistics (Hub docs).", "url": "https://huggingface.co/docs/dataset-viewer/statistics" }, "operationId": "getStatistics", "security": [ {}, { "AuthorizationHuggingFaceApiToken": [] }, { "AuthorizationHuggingFaceJWT": [] } ], "parameters": [ { "$ref": "#/components/parameters/RequiredDataset" }, { "$ref": "#/components/parameters/RequiredConfig" }, { "$ref": "#/components/parameters/RequiredSplit" } ], "responses": { "200": { "description": "The descriptive statistics for the columns of the split.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/StatisticsResponse" }, "examples": { "A split (mstz/wine) with numeric columns": { "summary": "Statistics on numeric columns.", "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=mstz/wine&config=wine&split=train.", "value": { "num_examples": 6497, "statistics": [ { "column_name": "alcohol", "column_type": "float", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 8.0, "max": 14.9, "mean": 10.4918, "median": 10.3, "std": 1.19271, "histogram": { "hist": [ 40, 1133, 1662, 1156, 1092, 628, 569, 175, 41, 1 ], "bin_edges": [ 8.0, 8.69, 9.38, 10.07, 10.76, 11.45, 12.14, 12.83, 13.52, 14.21, 14.9 ] } } }, { "column_name": "chlorides", "column_type": "float", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 0.009, "max": 0.611, "mean": 0.05603, "median": 
0.047, "std": 0.03503, "histogram": { "hist": [5061, 1279, 92, 34, 8, 9, 10, 2, 0, 2], "bin_edges": [ 0.009, 0.0692, 0.1294, 0.1896, 0.2498, 0.31, 0.3702, 0.4304, 0.4906, 0.5508, 0.611 ] } } }, { "column_name": "citric_acid", "column_type": "float", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 0.0, "max": 1.66, "mean": 0.31863, "median": 0.31, "std": 0.14532, "histogram": { "hist": [ 766, 3113, 2059, 420, 126, 5, 6, 1, 0, 1 ], "bin_edges": [ 0.0, 0.166, 0.332, 0.498, 0.664, 0.83, 0.996, 1.162, 1.328, 1.494, 1.66 ] } } }, { "column_name": "density", "column_type": "float", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 0.98711, "max": 1.03898, "mean": 0.9947, "median": 0.99489, "std": 0.003, "histogram": { "hist": [1599, 3645, 1241, 9, 2, 0, 0, 0, 0, 1], "bin_edges": [ 0.98711, 0.9923, 0.99748, 1.00267, 1.00786, 1.01304, 1.01823, 1.02342, 1.02861, 1.03379, 1.03898 ] } } }, { "column_name": "fixed_acidity", "column_type": "float", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 3.8, "max": 15.9, "mean": 7.21531, "median": 7.0, "std": 1.29643, "histogram": { "hist": [ 63, 1151, 3248, 1339, 382, 177, 82, 41, 7, 7 ], "bin_edges": [ 3.8, 5.01, 6.22, 7.43, 8.64, 9.85, 11.06, 12.27, 13.48, 14.69, 15.9 ] } } }, { "column_name": "free_sulfur_dioxide", "column_type": "float", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 1.0, "max": 289.0, "mean": 30.52532, "median": 29.0, "std": 17.7494, "histogram": { "hist": [3392, 2676, 401, 20, 6, 1, 0, 0, 0, 1], "bin_edges": [ 1.0, 29.8, 58.6, 87.4, 116.2, 145.0, 173.8, 202.6, 231.4, 260.2, 289.0 ] } } }, { "column_name": "is_red", "column_type": "class_label", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "n_unique": 2, "frequencies": { "red": 1599, "white": 4898 } } }, { "column_name": "pH", "column_type": "float", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 2.72, "max": 4.01, "mean": 3.2185, "median": 
3.21, "std": 0.16079, "histogram": { "hist": [ 16, 334, 1233, 2111, 1663, 802, 263, 59, 12, 4 ], "bin_edges": [ 2.72, 2.849, 2.978, 3.107, 3.236, 3.365, 3.494, 3.623, 3.752, 3.881, 4.01 ] } } }, { "column_name": "quality", "column_type": "int", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 3, "max": 9, "mean": 5.81838, "median": 6.0, "std": 0.87326, "histogram": { "hist": [30, 216, 2138, 2836, 1079, 193, 5], "bin_edges": [3, 4, 5, 6, 7, 8, 9, 9] } } }, { "column_name": "residual_sugar", "column_type": "float", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 0.6, "max": 65.8, "mean": 5.44324, "median": 3.0, "std": 4.7578, "histogram": { "hist": [4551, 1396, 533, 14, 2, 0, 0, 0, 0, 1], "bin_edges": [ 0.6, 7.12, 13.64, 20.16, 26.68, 33.2, 39.72, 46.24, 52.76, 59.28, 65.8 ] } } }, { "column_name": "sulphates", "column_type": "float", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 0.22, "max": 2.0, "mean": 0.53127, "median": 0.51, "std": 0.14881, "histogram": { "hist": [ 1023, 3451, 1540, 382, 66, 21, 6, 4, 0, 4 ], "bin_edges": [ 0.22, 0.398, 0.576, 0.754, 0.932, 1.11, 1.288, 1.466, 1.644, 1.822, 2.0 ] } } }, { "column_name": "total_sulfur_dioxide", "column_type": "float", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 6.0, "max": 440.0, "mean": 115.74457, "median": 118.0, "std": 56.52185, "histogram": { "hist": [ 1088, 979, 2049, 1514, 721, 134, 8, 2, 1, 1 ], "bin_edges": [ 6.0, 49.4, 92.8, 136.2, 179.6, 223.0, 266.4, 309.8, 353.2, 396.6, 440.0 ] } } }, { "column_name": "volatile_acidity", "column_type": "float", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 0.08, "max": 1.58, "mean": 0.33967, "median": 0.29, "std": 0.16464, "histogram": { "hist": [ 1580, 3002, 996, 606, 214, 70, 22, 4, 2, 1 ], "bin_edges": [ 0.08, 0.23, 0.38, 0.53, 0.68, 0.83, 0.98, 1.13, 1.28, 1.43, 1.58 ] } } } ], "partial": false } }, "A split (ylecun/mnist) with a label column": { 
"summary": "Statistics on a class label column. The image column is not processed.", "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=ylecun/mnist&config=mnist&split=train.", "value": { "num_examples": 60000, "statistics": [ { "column_name": "label", "column_type": "class_label", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "n_unique": 10, "frequencies": { "0": 5923, "1": 6742, "2": 5958, "3": 6131, "4": 5842, "5": 5421, "6": 5918, "7": 6265, "8": 5851, "9": 5949 } } } ], "partial": false } }, "A split (CL-ETM/datetimeevents) with a datetime column": { "summary": "Statistics on a split with datetime columns 'charttime', 'storetime' and 'value'. ", "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=CL-ETM/datetimeevents&config=mnist&split=train.", "value": { "num_examples": 6653174, "statistics": [ { "column_name": "caregiver_id", "column_type": "int", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 45, "max": 99872, "mean": 49146.20367, "median": 46354.0, "std": 28893.09204, "histogram": { "hist": [ 586864, 696061, 882127, 627295, 759981, 594546, 544977, 653948, 507192, 800183 ], "bin_edges": [ 45, 10028, 20011, 29994, 39977, 49960, 59943, 69926, 79909, 89892, 99872 ] } } }, { "column_name": "charttime", "column_type": "datetime", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": "2110-01-13 09:39:00", "max": "2214-07-26 08:00:00", "mean": "2153-03-20 23:15:24", "median": "2153-01-19 04:19:30", "std": "8691 days, 20:22:21.464930", "histogram": { "hist": [ 644662, 824869, 883173, 884980, 861445, 863916, 838647, 664347, 156213, 30922 ], "bin_edges": [ "2110-01-13 09:39:00", "2120-06-27 07:05:07", "2130-12-10 04:31:14", "2141-05-24 01:57:21", "2151-11-05 23:23:28", "2162-04-19 20:49:35", "2172-10-01 18:15:42", "2183-03-16 15:41:49", "2193-08-28 13:07:56", "2204-02-11 10:34:03", "2214-07-26 08:00:00" ] } } }, { "column_name": "hadm_id", 
"column_type": "int", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 20000094, "max": 29999828, "mean": 25027899.88926, "median": 25052613.0, "std": 2869146.55704, "histogram": { "hist": [ 638196, 656157, 656168, 661133, 678335, 693220, 676587, 653053, 674626, 665699 ], "bin_edges": [ 20000094, 21000068, 22000042, 23000016, 23999990, 24999964, 25999938, 26999912, 27999886, 28999860, 29999828 ] } } }, { "column_name": "itemid", "column_type": "int", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 224183, "max": 230120, "mean": 225487.4805, "median": 224290.0, "std": 1820.04267, "histogram": { "hist": [ 3742726, 568047, 1012645, 75427, 21011, 41780, 311155, 100074, 249544, 530765 ], "bin_edges": [ 224183, 224777, 225371, 225965, 226559, 227153, 227747, 228341, 228935, 229529, 230120 ] } } }, { "column_name": "stay_id", "column_type": "int", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 30000153, "max": 39999858, "mean": 34988877.57506, "median": 34997302.0, "std": 2873138.27766, "histogram": { "hist": [ 669019, 638622, 695479, 665010, 659205, 659496, 696313, 662500, 671230, 636300 ], "bin_edges": [ 30000153, 31000124, 32000095, 33000066, 34000037, 35000008, 35999979, 36999950, 37999921, 38999892, 39999858 ] } } }, { "column_name": "storetime", "column_type": "datetime", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": "2110-01-13 13:13:00", "max": "2214-07-26 09:20:00", "mean": "2153-03-20 23:57:17", "median": "2153-01-19 03:42:00", "std": "8691 days, 20:22:32.902370", "histogram": { "hist": [ 644728, 824803, 883215, 884951, 861438, 863915, 838652, 664336, 156214, 30922 ], "bin_edges": [ "2110-01-13 13:13:00", "2120-06-27 10:25:43", "2130-12-10 07:38:26", "2141-05-24 04:51:09", "2151-11-06 02:03:52", "2162-04-19 23:16:35", "2172-10-01 20:29:18", "2183-03-16 17:42:01", "2193-08-28 14:54:44", "2204-02-11 12:07:27", "2214-07-26 09:20:00" ] } } }, { "column_name": "subject_id", 
"column_type": "int", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 10000032, "max": 16657691, "mean": 13340551.62433, "median": 13334004.0, "std": 1927957.39956, "histogram": { "hist": [ 638347, 684908, 691450, 631212, 672810, 659625, 641987, 654011, 702989, 675835 ], "bin_edges": [ 10000032, 10665798, 11331564, 11997330, 12663096, 13328862, 13994628, 14660394, 15326160, 15991926, 16657691 ] } } }, { "column_name": "value", "column_type": "datetime", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": "2109-08-02 00:00:00", "max": "2214-07-24 09:57:00", "mean": "2153-03-17 00:32:04", "median": "2153-01-15 00:00:00", "std": "8691 days, 20:07:56.642090", "histogram": { "hist": [ 611811, 820557, 897262, 880309, 876200, 860348, 845238, 673106, 157352, 30991 ], "bin_edges": [ "2109-08-02 00:00:00", "2120-01-31 03:23:43", "2130-07-31 06:47:26", "2141-01-28 10:11:09", "2151-07-29 13:34:52", "2162-01-26 16:58:35", "2172-07-26 20:22:18", "2183-01-24 23:46:01", "2193-07-25 03:09:44", "2204-01-24 06:33:27", "2214-07-24 09:57:00" ] } } }, { "column_name": "valueuom", "column_type": "string_label", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "no_label_count": 0, "no_label_proportion": 0.0, "n_unique": 2, "frequencies": { "Date and Time": 1885855, "Date": 4767319 } } }, { "column_name": "warning", "column_type": "int", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 0, "max": 1, "mean": 0.00028, "median": 0.0, "std": 0.01674, "histogram": { "hist": [ 6651308, 1866 ], "bin_edges": [ 0, 1, 1 ] } } } ], "partial": true } }, "A split (nyu-mll/glue) with a string (text) column": { "summary": "Statistics on a string column. 
The column 'hypothesis' contains more than 30 different strings, so the statistics are a histogram of the string lengths.", "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=nyu-mll/glue&config=ax&split=test.", "value": { "num_examples": 1104, "statistics": [ { "column_name": "hypothesis", "column_type": "string_text", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 11, "max": 296, "mean": 97.46649, "median": 88.0, "std": 55.82714, "histogram": { "hist": [ 171, 224, 235, 180, 102, 99, 53, 28, 10, 2 ], "bin_edges": [ 11, 40, 69, 98, 127, 156, 185, 214, 243, 272, 296 ] } } }, { "column_name": "idx", "column_type": "int", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 0, "max": 1103, "mean": 551.5, "median": 551.5, "std": 318.84165, "histogram": { "hist": [ 111, 111, 111, 111, 111, 111, 111, 111, 111, 105 ], "bin_edges": [ 0, 111, 222, 333, 444, 555, 666, 777, 888, 999, 1103 ] } } }, { "column_name": "label", "column_type": "class_label", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "no_label_count": 1104, "no_label_proportion": 1.0, "n_unique": 3, "frequencies": { "entailment": 0, "neutral": 0, "contradiction": 0 } } }, { "column_name": "premise", "column_type": "string_text", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 11, "max": 296, "mean": 97.46649, "median": 88.0, "std": 55.82714, "histogram": { "hist": [ 171, 224, 235, 180, 102, 99, 53, 28, 10, 2 ], "bin_edges": [ 11, 40, 69, 98, 127, 156, 185, 214, 243, 272, 296 ] } } } ], "partial": false } }, "A split (Rowan/hellaswag) with a string (label) column": { "summary": "Statistics on a string column. 
The column 'label' contains fewer than 30 different strings, so each string is considered as a label, and the statistics are a count per label.", "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=Rowan/hellaswag&config=default&split=train.", "value": { "num_examples": 39905, "statistics": [ { "column_name": "activity_label", "column_type": "string_text", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 3, "max": 31, "mean": 15.7912, "median": 16.0, "std": 6.50536, "histogram": { "hist": [ 1564, 5973, 2467, 5844, 8075, 4732, 5382, 3965, 1825, 78 ], "bin_edges": [ 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 31 ] } } }, { "column_name": "ctx", "column_type": "string_text", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 29, "max": 524, "mean": 217.86533, "median": 231.0, "std": 95.94445, "histogram": { "hist": [ 3221, 6658, 4573, 5085, 8060, 7375, 3652, 1092, 169, 20 ], "bin_edges": [ 29, 79, 129, 179, 229, 279, 329, 379, 429, 479, 524 ] } } }, { "column_name": "ctx_a", "column_type": "string_text", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 1, "max": 524, "mean": 214.18228, "median": 231.0, "std": 99.93323, "histogram": { "hist": [ 1987, 6198, 5422, 4068, 7695, 8395, 4549, 1356, 215, 20 ], "bin_edges": [ 1, 54, 107, 160, 213, 266, 319, 372, 425, 478, 524 ] } } }, { "column_name": "ctx_b", "column_type": "string_text", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 0, "max": 140, "mean": 3.31367, "median": 0.0, "std": 6.83284, "histogram": { "hist": [ 37877, 1518, 347, 107, 36, 15, 4, 0, 0, 1 ], "bin_edges": [ 0, 15, 30, 45, 60, 75, 90, 105, 120, 135, 140 ] } } }, { "column_name": "ind", "column_type": "int", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 2, "max": 50674, "mean": 25408.91299, "median": 25533.0, "std": 14670.02268, "histogram": { "hist": [ 4034, 4023, 3891, 3887, 3975, 3845, 4131, 4062, 4089, 3968 ], "bin_edges": [ 2, 
5070, 10138, 15206, 20274, 25342, 30410, 35478, 40546, 45614, 50674 ] } } }, { "column_name": "label", "column_type": "string_label", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "n_unique": 4, "frequencies": { "3": 10021, "2": 9867, "0": 9986, "1": 10031 } } }, { "column_name": "source_id", "column_type": "string_text", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 9, "max": 25, "mean": 17.35021, "median": 13.0, "std": 5.86238, "histogram": { "hist": [31, 2927, 22207, 0, 0, 0, 0, 0, 14740], "bin_edges": [ 9, 11, 13, 15, 17, 19, 21, 23, 25, 25 ] } } }, { "column_name": "split", "column_type": "string_label", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "n_unique": 1, "frequencies": { "train": 39905 } } }, { "column_name": "split_type", "column_type": "string_label", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "n_unique": 1, "frequencies": { "indomain": 39905 } } } ], "partial": false } }, "A split (google/boolq) with a boolean column": { "summary": "Statistics on a boolean column 'answer'.", "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=google/boolq&config=default&split=train.", "value": { "num_examples": 9427, "statistics": [ { "column_name": "answer", "column_type": "bool", "column_statistics": { "nan_count": 0, "nan_proportion": 0, "frequencies": { "True": 5874, "False": 3553 } } }, { "column_name": "passage", "column_type": "string_text", "column_statistics": { "nan_count": 0, "nan_proportion": 0, "min": 35, "max": 4720, "mean": 565.61303, "median": 511, "std": 323.1375, "histogram": { "hist": [4598, 3939, 721, 133, 20, 7, 5, 3, 0, 1], "bin_edges": [ 35, 504, 973, 1442, 1911, 2380, 2849, 3318, 3787, 4256, 4720 ] } } }, { "column_name": "question", "column_type": "string_text", "column_statistics": { "nan_count": 0, "nan_proportion": 0, "min": 20, "max": 100, "mean": 43.99194, "median": 42, "std": 8.85434, "histogram": { "hist": [ 43, 1914, 4636, 2046, 
501, 159, 68, 34, 26 ], "bin_edges": [ 20, 29, 38, 47, 56, 65, 74, 83, 92, 100 ] } } } ] } }, "A split (MLCommons/peoples_speech) with audio column": { "summary": "Statistics on an audio column 'audio'.", "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=MLCommons/peoples_speech&config=validation&split=validation.", "value": { "num_examples": 18622, "statistics": [ { "column_name": "audio", "column_type": "audio", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 0.653, "max": 105.97, "mean": 6.41103, "median": 4.8815, "std": 5.63269, "histogram": { "hist": [15867, 2319, 350, 67, 12, 5, 0, 1, 0, 1], "bin_edges": [ 0.653, 11.1847, 21.7164, 32.2481, 42.7798, 53.3115, 63.8432, 74.3749, 84.9066, 95.4383, 105.97 ] } } }, { "column_name": "duration_ms", "column_type": "int", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 833, "max": 105970, "mean": 6411.06079, "median": 4881.5, "std": 5632.67057, "histogram": { "hist": [15950, 2244, 345, 64, 12, 5, 0, 1, 0, 1], "bin_edges": [ 833, 11347, 21861, 32375, 42889, 53403, 63917, 74431, 84945, 95459, 105970 ] } } }, { "column_name": "id", "column_type": "string_text", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 43, "max": 197, "mean": 120.06675, "median": 136.0, "std": 44.49607, "histogram": { "hist": [ 3599, 939, 278, 1914, 1838, 1646, 4470, 1443, 1976, 519 ], "bin_edges": [ 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 197 ] } } }, { "column_name": "text", "column_type": "string_text", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 1, "max": 1219, "mean": 94.52873, "median": 75.0, "std": 79.11078, "histogram": { "hist": [ 13703, 3975, 744, 146, 36, 10, 5, 1, 1, 1 ], "bin_edges": [ 1, 123, 245, 367, 489, 611, 733, 855, 977, 1099, 1219 ] } } } ], "partial": false } }, "A split (Matthijs/snacks) with image column": { "summary": "Statistics on an image column 'image'.", "description": "Try with 
https://datasets-server.huggingface.co/statistics?dataset=Matthijs/snacks&config=default&split=train.", "value": { "num_examples": 4838, "statistics": [ { "column_name": "image", "column_type": "image", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "min": 256, "max": 873, "mean": 327.99339, "median": 341.0, "std": 60.07286, "histogram": { "hist": [ 1734, 1637, 1326, 121, 10, 3, 1, 3, 1, 2 ], "bin_edges": [ 256, 318, 380, 442, 504, 566, 628, 690, 752, 814, 873 ] } } }, { "column_name": "label", "column_type": "class_label", "column_statistics": { "nan_count": 0, "nan_proportion": 0.0, "no_label_count": 0, "no_label_proportion": 0.0, "n_unique": 20, "frequencies": { "apple": 250, "banana": 250, "cake": 249, "candy": 249, "carrot": 249, "cookie": 249, "doughnut": 250, "grape": 250, "hot dog": 250, "ice cream": 250, "juice": 250, "muffin": 250, "orange": 249, "pineapple": 260, "popcorn": 180, "pretzel": 154, "salad": 250, "strawberry": 249, "waffle": 250, "watermelon": 250 } } } ], "partial": false } } } } } }, "401": { "$ref": "#/components/responses/Common401" }, "404": { "$ref": "#/components/responses/DatasetConfigSplit404" }, "422": { "$ref": "#/components/responses/DatasetConfigSplit422" }, "500": { "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. 
If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-500-common" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "error in the dataset itself": { "summary": "An error while processing the dataset prevents the response from being created.", "description": "Try with https://datasets-server.huggingface.co/statistics?dataset=allenai/atomic&config=atomic&split=train", "value": { "error": "Couldn't get the size of external files in `_split_generators` because a request failed:\n404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\nPlease consider moving your data files in this dataset repository instead (e.g. 
inside a data/ folder).", "cause_exception": "HTTPError", "cause_message": "404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz", "cause_traceback": [ "Traceback (most recent call last):\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 497, in _is_too_big_from_external_data_files\n for i, size in enumerate(pool.imap_unordered(get_size, ext_data_files)):\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 870, in next\n raise value\n", " File \"/usr/local/lib/python3.9/multiprocessing/pool.py\", line 125, in worker\n result = (True, func(*args, **kwds))\n", " File \"/src/services/worker/src/worker/job_runners/config/parquet_and_info.py\", line 396, in _request_size\n response.raise_for_status()\n", " File \"/src/services/worker/.venv/lib/python3.9/site-packages/requests/models.py\", line 1021, in raise_for_status\n raise HTTPError(http_error_msg, response=self)\n", "requests.exceptions.HTTPError: 404 Client Error: Not Found for url: https://maartensap.com/atomic/data/atomic_data.tgz\n" ] } }, "response not ready": { "$ref": "#/components/examples/ResponseNotReadyError" }, "unexpected error": { "$ref": "#/components/examples/UnexpectedJsonError" } } }, "text/plain": { "schema": { "$ref": "#/components/schemas/ServerErrorResponse" }, "examples": { "internal server error": { "$ref": "#/components/examples/UnexpectedTextError" } } } } }, "501": { "description": "The server does not implement the feature.", "headers": { "Cache-Control": { "$ref": "#/components/headers/Cache-Control" }, "Access-Control-Allow-Origin": { "$ref": "#/components/headers/Access-Control-Allow-Origin" }, "X-Error-Code": { "$ref": "#/components/headers/X-Error-Code-501" } }, "content": { "application/json": { "schema": { "$ref": "#/components/schemas/CustomError" }, "examples": { "blocked dataset": { "summary": "The dataset is blocked manually on the server.", "description": "Try with 
https://datasets-server.huggingface.co/statistics?dataset=echarlaix/vqa-lxmert&config=vqa&split=validation", "value": { "error": "The parquet conversion has been disabled for this dataset for now. Please open an issue in https://github.com/huggingface/dataset-viewer if you want this dataset to be supported." } } } } } } } } } } }