scgpt.scbank.data 源代码
import json
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import List, Optional, Union
from typing_extensions import Self, Literal
from datasets import Dataset
[文档]
@dataclass
class DataTable:
"""
The data structure for a single-cell data table.
"""
name: str
data: Optional[Dataset] = None
@property
def is_loaded(self) -> bool:
return self.data is not None and isinstance(self.data, Dataset)
[文档]
def save(
self,
path: Union[Path, str],
format: Literal["json", "parquet"] = "json",
) -> None:
if not self.is_loaded:
raise ValueError("DataTable is not loaded.")
if isinstance(path, str):
path = Path(path)
if format == "json":
self.data.to_json(path)
elif format == "parquet":
self.data.to_parquet(path)
else:
raise ValueError(f"Unknown format: {format}")
[文档]
@dataclass
class MetaInfo:
"""
The data structure for meta info of a scBank data directory.
"""
on_disk_path: Union[Path, str, None] = None
on_disk_format: Literal["json", "parquet"] = "json"
main_table_key: Optional[str] = None
# TODO: use md5 to check the vocab file name on disk
gene_vocab_md5: Optional[str] = None
study_ids: Optional[List[int]] = field(
default=None,
metadata={"help": "List of study IDs"},
)
cell_ids: Optional[List[int]] = field(
default=None,
metadata={"help": "List of cell IDs"},
)
# md5: Optional[str] = field(
# default=None,
# metadata={"help": "MD5 hash of the gene vocabulary"},
# )
def __post_init__(self):
if self.on_disk_path is not None:
self.on_disk_path: Path = Path(self.on_disk_path)
[文档]
def save(self, path: Union[Path, str, None] = None) -> None:
"""
Save meta info to path. If path is None, will save to the same path at
:attr:`on_disk_path`.
"""
if path is None:
path = self.on_disk_path
if isinstance(path, str):
path = Path(path)
manifests = {
"on_disk_format": self.on_disk_format,
"main_data": self.main_table_key,
"gene_vocab_md5": self.gene_vocab_md5,
}
with open(path / "manifest.json", "w") as f:
json.dump(manifests, f, indent=2)
# TODO: currently only save study table, add saving other tables
with open(path / "studytable.json", "w") as f:
json.dump({"study_ids": self.study_ids}, f, indent=2)
[文档]
def load(self, path: Union[Path, str, None] = None) -> None:
"""
Load meta info from path. If path is None, will load from the same path
at :attr:`on_disk_path`.
"""
if path is None:
path = self.on_disk_path
if isinstance(path, str):
path = Path(path)
with open(path / "manifest.json") as f:
manifests = json.load(f)
self.on_disk_format = manifests["on_disk_format"]
self.main_table_key = manifests["main_data"]
self.gene_vocab_md5 = manifests["gene_vocab_md5"]
if (path / "studytable.json").exists():
with open(path / "studytable.json") as f:
study_ids = json.load(f)
self.study_ids = study_ids["study_ids"]
[文档]
@classmethod
def from_path(cls, path: Union[Path, str]) -> Self:
"""
Create a MetaInfo object from a path.
"""
if isinstance(path, str):
path = Path(path)
if not path.exists():
raise ValueError(f"Path {path} does not exist.")
if not path.is_dir():
raise ValueError(f"Path {path} is not a directory.")
if not (path / "manifest.json").exists():
raise ValueError(f"Path {path} does not contain manifest.json.")
meta_info = cls()
meta_info.on_disk_path = path
meta_info.load(path)
return meta_info