def download_dataset(url: str, dataset_name: str) -> None:
    """Download a zipped dataset and extract it.

    The archive is saved as ``{DATASETS_ZIP_FOLDER}/{dataset_name}.zip``. When
    that file already exists, nothing is downloaded or extracted. If the number
    of bytes received differs from the ``content-length`` announced by the
    server, the :class:`FileSizeError` message is printed and the archive is
    deleted; that error is handled here and not propagated to the caller.

    :param url: url of the dataset
    :type url: str
    :param dataset_name: dataset name, used for the archive file name and
        passed on to the extraction step
    :type dataset_name: str
    :raises requests.HTTPError: if the server answers with an error status
    """
    os.makedirs(DATASETS_ZIP_FOLDER, exist_ok=True)
    dataset_zip_path = f"{DATASETS_ZIP_FOLDER}/{dataset_name}.zip"

    # os.listdir() yields bare file names, so testing the full path against it
    # never matched and the archive was fetched again on every call.
    if os.path.exists(dataset_zip_path):
        return

    block_size = 1024
    try:
        # The context manager releases the streamed connection even on errors.
        with requests.get(url, stream=True) as req:
            # Fail before writing anything, so an HTML error page is never
            # stored under the archive name.
            req.raise_for_status()
            # The header may be missing (e.g. chunked transfer); 0 means the
            # size is unknown and the size check below is skipped.
            total_size = int(req.headers.get("content-length", 0))
            with open(dataset_zip_path, "wb") as fw, tqdm(
                total=total_size, unit="iB", unit_scale=True
            ) as progress:
                for data in req.iter_content(block_size):
                    progress.update(len(data))
                    fw.write(data)
                downloaded = progress.n

        if total_size != 0 and downloaded != total_size:
            raise FileSizeError()
        __extract_dataset_zip(dataset_zip_path, dataset_name)
    except FileSizeError as err:
        print(err.message)
        if os.path.exists(dataset_zip_path):
            os.remove(dataset_zip_path)
    except BaseException:
        # A partial or unreadable archive must not stay on disk, otherwise the
        # existence check above would treat it as a valid cached download.
        if os.path.exists(dataset_zip_path):
            os.remove(dataset_zip_path)
        raise
def read_jsonl(dataset_path: str) -> List[Dict[str, Any]]:
    """Read a JSONL (JSON Lines) file.

    Every non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example an empty line at the end of the file)
    are skipped instead of aborting the whole read.

    :param dataset_path: path of the dataset
    :type dataset_path: str
    :return: list of JSON objects, one per non-blank line, in file order
    :rtype: List[Dict[str, Any]]
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    with open(dataset_path, "r", encoding="utf-8") as lines:
        # Iterate the file directly: no intermediate list of raw lines needed.
        return [json.loads(line) for line in lines if line.strip()]
def read_json(dataset_path: str) -> List[Dict[str, Any]]:
    """Load and parse a JSON file.

    :param dataset_path: path of the JSON file to read
    :type dataset_path: str
    :return: the parsed content of the file (annotated as a list of JSON objects)
    :rtype: List[Dict[str, Any]]
    """
    with open(dataset_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)
def __extract_dataset_zip(dataset_zip_path: str, dataset_name: str) -> None:
    """Unpack a zipped dataset into its own folder.

    All members of the archive are extracted to
    ``{DATASETS_FOLDER}/{dataset_name}``.

    :param dataset_zip_path: path of the zip archive on disk
    :type dataset_zip_path: str
    :param dataset_name: dataset name, used as the extraction folder name
    :type dataset_name: str
    """
    target_dir = f"{DATASETS_FOLDER}/{dataset_name}"
    with ZipFile(dataset_zip_path, mode="r") as archive:
        archive.extractall(path=target_dir)