spark.SparkDatasetManifest
/**
 * Shape of the `torch.json` manifest file for a dataset, model, or project.
 *
 * The manifest describes the structure and metadata of a project on
 * torchjs.org. It can describe a dataset-only project, a model-only project,
 * or a full training project.
 *
 * Only `name` and `author` are required at the top level; every other
 * section is declared optional.
 */
export interface SparkDatasetManifest {
  /** Project name (required). */
  name: string;
  /** Project version (semantic versioning). */
  version?: string;
  /** Human-readable description of the project. */
  description?: string;
  /** Project author (usually a torchjs.org username). Required. */
  author: string;
  /** Project type: "dataset", "model", or "project". */
  type?: 'dataset' | 'model' | 'project';
  /**
   * Dataset configuration.
   *
   * Required for dataset and project types. Note that the property itself is
   * declared optional, so this requirement is not enforced by the type.
   */
  dataset?: {
    /** Data format: "text", "image", etc. */
    format?: string;
    /**
     * Dataset splits, keyed by split name (train, test, val, etc.).
     *
     * For image classification: specify the `images` and `labels` file paths.
     * For text: specify a single `file` path.
     */
    splits: {
      [split: string]: {
        /** Single file path (for text datasets). */
        file?: string;
        /** Images file path (for image classification). */
        images?: string;
        /** Labels file path (for image classification). */
        labels?: string;
        /** Number of samples in this split. */
        samples?: number;
        /** Number of tokens in this split (for language models). */
        tokens?: number;
      };
    };
    /** Image dimensions as `[height, width]` (for image datasets), e.g. `[28, 28]`. */
    image_size?: [number, number];
    /** Number of classes (for classification). */
    num_classes?: number;
    /** Data type: "float32", "uint8", etc. */
    dtype?: string;
    /** Path to the vocabulary file (for language models). */
    vocabulary?: string;
    /** Tokenizer type: "char", "bpe", etc. */
    tokenizer?: string;
  };
  /**
   * Model configuration.
   *
   * Required for model and project types. Note that the property itself is
   * declared optional, so this requirement is not enforced by the type.
   */
  model?: {
    /** Model architecture name. */
    architecture?: string;
    /** Path to the model weights file (required whenever `model` is present). */
    file: string;
    /** Path to the model configuration file. */
    config?: string;
    /** Number of parameters. */
    parameters?: number;
    /** Data type of the weights. */
    dtype?: string;
  };
  /**
   * Training configuration and results.
   *
   * Metadata about how the model was trained.
   */
  training?: {
    /** Number of training epochs. */
    epochs?: number;
    /** Batch size used during training. */
    batch_size?: number;
    /** Optimizer algorithm. */
    optimizer?: string;
    /** Learning rate. */
    learning_rate?: number;
    /** Final accuracy achieved. */
    final_accuracy?: number;
  };
  /**
   * File manifest with checksums.
   *
   * Maps each filename to its size and (optionally) its SHA256 hash.
   */
  files?: {
    [filename: string]: {
      /** File size in bytes. */
      size: number;
      /** SHA256 hash of the file contents. */
      sha256?: string;
    };
  };
}name(string)- – Project name
version (string, optional) – Project version (semantic versioning)
description (string, optional) – Human-readable description
author (string) – Project author (usually torchjs.org username)
type ('dataset' | 'model' | 'project', optional) – Project type: "dataset", "model", or "project"
dataset ({ /** Data format: "text", "image", etc. */ format?: string; /** * Dataset splits (train, test, val, etc.) * * For image classification: specify `images` and `labels` file paths * For text: specify single `file` path */ splits: { [split: string]: { /** Single file path (for text datasets) */ file?: string; /** Images file path (for image classification) */ images?: string; /** Labels file path (for image classification) */ labels?: string; /** Number of samples in this split */ samples?: number; /** Number of tokens (for language models) */ tokens?: number; }; }; /** Image dimensions [height, width] (for image datasets) */ image_size?: [number, number]; /** Number of classes (for classification) */ num_classes?: number; /** Data type: "float32", "uint8", etc. */ dtype?: string; /** Path to vocabulary file (for language models) */ vocabulary?: string; /** Tokenizer type: "char", "bpe", etc. */ tokenizer?: string; }, optional) – Dataset configuration. Required for dataset and project types.
model ({ /** Model architecture name */ architecture?: string; /** Path to model weights file */ file: string; /** Path to model configuration file */ config?: string; /** Number of parameters */ parameters?: number; /** Data type of weights */ dtype?: string; }, optional) – Model configuration. Required for model and project types.
training ({ /** Number of training epochs */ epochs?: number; /** Batch size used during training */ batch_size?: number; /** Optimizer algorithm */ optimizer?: string; /** Learning rate */ learning_rate?: number; /** Final accuracy achieved */ final_accuracy?: number; }, optional) – Training configuration and results. Metadata about how the model was trained.
files ({ [filename: string]: { /** File size in bytes */ size: number; /** SHA256 hash of file contents */ sha256?: string; }; }, optional) – File manifest with checksums. Maps filenames to their size and SHA256 hash.
torch.json manifest file for a dataset, model, or project.
This file describes the structure and metadata of a project on torchjs.org. It can describe a dataset-only project, a model-only project, or a full training project.
Examples
{
  "name": "mnist",
  "version": "1.0.0",
  "description": "Handwritten digits dataset",
  "author": "kasumi",
  "type": "dataset",
  "dataset": {
    "splits": {
      "train": {
        "images": "data/train-images.bin",
        "labels": "data/train-labels.bin",
        "samples": 60000
      },
      "test": {
        "images": "data/test-images.bin",
        "labels": "data/test-labels.bin",
        "samples": 10000
      }
    },
    "image_size": [28, 28],
    "num_classes": 10,
    "dtype": "uint8"
  }
}