"""Utilities for extracting archivesSee the source code on Github: `mltk/utils/archive.py <https://github.com/siliconlabs/mltk/blob/master/mltk/utils/archive.py>`_"""importosimportreimporttarfileimportgzipimportstructimportshutilfromtypingimportCallable,Unionfrompatoolib.programsimporttar# pylint: disable=unused-importimportpatoolibfrom.pythonimportappend_exception_msg,prepend_exception_msg# We use a custom zipfile class which allows# for extracting large zipfiles on Windowsfrom.zipfile_win32importZipFilefrom.importpath
def extract_archive(
    archive_path: str,
    dest_dir: str,
    extract_nested: bool = False,
    remove_root_dir: bool = False,
    clean_dest_dir: Union[bool, Callable] = True,
):
    """Unpack an archive file into a destination directory

    The extraction backend is picked from the file name suffix:
    ``.zip``, ``.tar.gz`` and ``.gz`` have dedicated handlers,
    anything else is handed to patool.

    Args:
        archive_path: Path to the archive file
        dest_dir: Directory that receives the extracted content
        extract_nested: If true, the archive is expected to contain another archive
            which is extracted as well
        remove_root_dir: If true and the archive has a single root directory,
            drop that directory from the extracted paths
        clean_dest_dir: Truthy to clean the destination before extracting.
            If a callable is given it is invoked (with no arguments) instead of
            removing ``dest_dir``

    Raises:
        Exception: Any extraction error is re-raised after its message
            has been prefixed with the archive and destination paths
    """
    # Optional cleanup step: a callable replaces the default directory removal
    if clean_dest_dir and callable(clean_dest_dir):
        clean_dest_dir()
    elif clean_dest_dir:
        path.remove_directory(dest_dir)

    try:
        if extract_nested or remove_root_dir:
            _extractnested_archive(
                archive_path,
                dest_dir,
                extract_nested=extract_nested,
                remove_root_dir=remove_root_dir
            )
        else:
            # Order matters: '.tar.gz' must be tested before the plain '.gz' suffix
            suffix_handlers = (
                ('.zip', _extractall_zipfile),
                ('.tar.gz', _extractall_tarfile),
                ('.gz', _extractall_gzfile),
            )
            extractor = next(
                (handler for suffix, handler in suffix_handlers if archive_path.endswith(suffix)),
                _extractall_patool
            )
            extractor(archive_path, dest_dir)
    except Exception as e:
        prepend_exception_msg(e, f'Failed to extract {archive_path} to {dest_dir}')
        raise
def gzip_file(src_path: str, dst_path: str = None) -> str:
    """Compress a single file with gzip

    Args:
        src_path: Path to the local file to compress
        dst_path: Optional path of the .gz file to create.
            When omitted (or empty) ``src_path + '.gz'`` is used

    Return:
        Path to the generated .gz file
    """
    # Fall back to "<source>.gz" when no destination was supplied
    dst_path = dst_path or src_path + '.gz'

    with open(src_path, 'rb') as raw_stream, gzip.open(dst_path, 'wb') as gz_stream:
        shutil.copyfileobj(raw_stream, gz_stream)

    return dst_path
def gzip_directory_files(
    src_dir: str,
    dst_archive: str = None,
    regex: Union[str, re.Pattern, Callable[[str], bool]] = None,
) -> str:
    """Recursively gzip all files in given directory.

    The generated .tar.gz contains the same directory structure as the src_dir.

    Args:
        src_dir: Path to the directory to archive
        dst_archive: Path to generated .tar.gz. If omitted then use src_dir + .tar.gz
            (created next to src_dir, named after the directory)
        regex: Optional filter selecting the file paths to INCLUDE in the archive.
            This can either be a string, re.Pattern, or a callback function.
            Strings and patterns are applied with ``match()``, i.e. anchored at the
            start of the path.
            The tested path is the relative path to src_dir with forward slashes.
            If a callback function is given, if the function returns True
            then the path is INCLUDED, else it is excluded

    Return:
        Path to generated .tar.gz
    """
    if not dst_archive:
        # Resolve the directory first so that inputs such as "data/" or "."
        # still produce "<parent>/<dir name>.tar.gz" rather than ".tar.gz"
        src_abs = os.path.abspath(src_dir)
        dst_archive = f'{os.path.dirname(src_abs)}/{os.path.basename(src_abs)}.tar.gz'

    if regex is None:
        def regex_func(_rel_path: str) -> bool:
            return True
    elif isinstance(regex, str):
        regex_func = re.compile(regex).match
    elif isinstance(regex, re.Pattern):
        regex_func = regex.match
    else:
        regex_func = regex

    archive_name = os.path.basename(dst_archive)
    archive_real = os.path.realpath(dst_archive)

    with tarfile.open(dst_archive, 'w:gz') as dst:
        for root, _, files in os.walk(src_dir):
            for fn in files:
                abs_path = os.path.join(root, fn)

                # Never add the archive to itself (it may live inside src_dir).
                # Compare the resolved location, not just the name, so an unrelated
                # file with the same name in another directory is still archived.
                if fn == archive_name and os.path.realpath(abs_path) == archive_real:
                    continue

                rel_path = os.path.relpath(abs_path, src_dir).replace('\\', '/')
                if not regex_func(rel_path):
                    continue

                dst.add(abs_path, arcname=rel_path)

    return dst_archive
def _extractall_patool(archive_path, output_dir, patool_path=None):
    """Extract an archive into output_dir using patool

    If patool fails on a file that does not end with ``.gz``, the archive is
    copied to a sibling path with a ``.gz`` extension and extraction is retried once.

    Args:
        archive_path: Path to the archive file
        output_dir: Directory to extract into (created if missing)
        patool_path: Not used by this function; kept so existing callers keep working

    Raises:
        patoolib.util.PatoolError: If extraction fails. The message is prefixed with
            the paths involved and, when 7zip appears to be missing, suffixed
            with an installation hint
    """
    archive_path = path.fullpath(archive_path)
    output_dir = path.fullpath(output_dir)
    os.makedirs(output_dir, exist_ok=True)

    # Override the default tar command
    # so we can add the option: --force-local
    # This allows for running on Windows
    patoolib.programs.tar.extract_tar = _extract_tar

    try:
        patoolib.extract_archive(archive_path, interactive=False, outdir=output_dir)
    except patoolib.util.PatoolError as e:
        prepend_exception_msg(e, f'Failed to extract archive: {archive_path} to {output_dir}')
        if archive_path.endswith('.gz'):
            raise

        # This is extremely hacky but works sometimes...
        # If extraction failed, try changing the extension gz and run again
        old = archive_path
        base, _ = os.path.splitext(archive_path)
        archive_path = base + '.gz'
        try:
            os.remove(archive_path)
        except OSError:
            # Best effort: usually there is simply no stale copy to delete
            pass
        shutil.copy2(old, archive_path)
        try:
            patoolib.extract_archive(archive_path, interactive=False, outdir=output_dir)
            return
        except patoolib.util.PatoolError:
            # The retry failed as well, report the ORIGINAL error below
            pass

        if 'could not find an executable program to extract format 7z' in f'{e}':
            msg = '\n\nIs 7zip installed on your computer? \n'
            if os.name == 'nt':
                msg += 'You can download and install it from here: https://www.7-zip.org/download.html'
            else:
                msg += 'You can install it with: sudo apt install p7zip-full'
            msg += '\n\n'
            append_exception_msg(e, msg)
        raise


def _extractall_zipfile(archive_path, output_dir):
    """Extract a .zip archive into output_dir (created if missing)"""
    archive_path = path.fullpath(archive_path)
    output_dir = path.fullpath(output_dir)
    os.makedirs(output_dir, exist_ok=True)
    # NOTE(review): the ZipFile object is never closed explicitly.
    # If the custom ZipFile class supports the context manager protocol, prefer `with`
    ZipFile(archive_path).extractall(output_dir)


def _extractall_gzfile(archive_path, output_dir):
    """Decompress a single-file .gz archive into output_dir

    The output file is named after the filename stored in the gzip header,
    or after the archive itself (minus ``.gz``) when the header has none.
    Only the final path component of that name is used.
    """
    archive_path = path.fullpath(archive_path)
    output_dir = path.fullpath(output_dir)
    with gzip.open(archive_path, 'rb') as f_in:
        fname, _ = _read_gzip_info(f_in)

        # The name may be the archive's full path (no name in the header) or an
        # arbitrary string taken from the header. Keep only its last component
        # so that the output always lands directly inside output_dir
        fname = os.path.basename(fname.replace('\\', '/'))
        if not fname or fname in ('.', '..'):
            fname = os.path.basename(archive_path)
            if fname.endswith('.gz'):
                fname = fname[:-3]

        output_path = f'{output_dir}/{fname}'
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)


def _extractall_tarfile(archive_path, output_dir):
    """Extract a tar archive into output_dir after checking every member path

    Raises:
        RuntimeWarning: If a member would be extracted outside of output_dir
    """

    def _is_within_directory(directory: str, target: str) -> bool:
        abs_directory = os.path.abspath(directory)
        abs_target = os.path.abspath(target)
        try:
            # commonpath() compares whole path components. The character-based
            # commonprefix() would accept "/out_evil/x" as being inside "/out"
            return os.path.commonpath([abs_directory, abs_target]) == abs_directory
        except ValueError:
            # Raised e.g. for paths on different drives: certainly not inside
            return False

    with tarfile.open(archive_path) as tar_file:
        # Guard against CVE-2007-4559 (https://github.com/advisories/GHSA-gw9q-c7gh-j9vm):
        # extract()/extractall() on unsanitized input allows a maliciously crafted
        # .tar file to perform a directory path traversal attack.
        # All members are checked BEFORE anything is written to disk.
        # Further technical information about the vulnerability:
        # https://www.trellix.com/en-us/about/newsroom/stories/research/tarfile-exploiting-the-world.html
        # NOTE(review): only member names are validated here, link targets are not
        for member in tar_file.getmembers():
            member_path = os.path.join(output_dir, member.name)
            if not _is_within_directory(output_dir, member_path):
                raise RuntimeWarning(f"Attempted path traversal in TAR file: {archive_path}, archive path {member_path} not within {output_dir}")

        tar_file.extractall(output_dir)


def _extractnested_archive(
    archive_path: str,
    output_dir: str,
    extract_nested: bool,
    remove_root_dir: bool
):
    """Extract an archive via a temporary directory, then copy the wanted part to output_dir

    Args:
        archive_path: Path to the archive file, it must have a file extension
        output_dir: Directory receiving the final content
        extract_nested: If true, locate the first archive inside the extracted content,
            extract it too and copy the first directory of its content that holds files
        remove_root_dir: Only consulted by the caller. When extract_nested is false
            the archive must contain exactly one root directory (and no root files),
            whose content is copied to output_dir

    Raises:
        ValueError: If archive_path has no file extension
        RuntimeError: If no nested archive is found, or
            the archive does not contain a single root directory
    """
    # path.extension() presumably returns the extension without the leading dot
    ext = path.extension(archive_path)
    if not ext:
        raise ValueError(f'Archive path: {archive_path} does not have a valid file extension')
    ext = '.' + ext

    tmp_dir = path.create_tempdir('tmp_archives/' + os.path.basename(archive_path).replace(ext, ''))
    extract_archive(archive_path, tmp_dir, clean_dest_dir=True)

    if extract_nested:
        # Find the first file that looks like an archive
        nested_archive_path = None
        for root, _, files in os.walk(tmp_dir):
            if nested_archive_path is not None:
                break
            for fn in files:
                if fn.endswith(patoolib.ArchiveFormats + ('gz', 'bz', 'bz2')):
                    nested_archive_path = os.path.join(root, fn)
                    break

        if nested_archive_path is None:
            raise RuntimeError(f'No nested archive found in {archive_path}')

        # Name the nested extraction directory after the NESTED archive,
        # with its own extension (dot included) removed
        nested_basename = os.path.basename(nested_archive_path)
        nested_name = nested_basename
        nested_ext = path.extension(nested_archive_path)
        if nested_ext:
            nested_name = nested_basename.replace('.' + nested_ext, '')
        nested_tmp_dir = f'{tmp_dir}/{nested_name}'
        if not nested_name or (os.path.exists(nested_tmp_dir) and not os.path.isdir(nested_tmp_dir)):
            # The directory must not collide with an existing file,
            # in particular not with the nested archive itself
            nested_tmp_dir = f'{tmp_dir}/{nested_basename}_extracted'

        extract_archive(nested_archive_path, nested_tmp_dir, clean_dest_dir=False)

        # Use the first directory that actually contains files
        nested_src_dir = None
        for root, _, files in os.walk(nested_tmp_dir):
            if files:
                nested_src_dir = root
                break

    else:
        nested_src_dir = None
        for fn in os.listdir(tmp_dir):
            p = f'{tmp_dir}/{fn}'
            if os.path.isfile(p) or (os.path.isdir(p) and nested_src_dir is not None):
                raise RuntimeError('Archive does not contain a single root directory')
            nested_src_dir = p

    if nested_src_dir is not None:
        path.copy_directory(nested_src_dir, output_dir)


# This overrides the default function in:
# patoolib.programs.tar
# It adds: "--force-local" to the command-line so that it can run on Windows
def _extract_tar(archive, compression, cmd, verbosity, interactive, outdir):
    """Extract a TAR archive.

    Returns:
        The tar command-line as a list of arguments
    """
    cmdlist = [cmd, '--extract', '--force-local']
    patoolib.programs.tar.add_tar_opts(cmdlist, compression, verbosity)
    cmdlist.extend(["--file", archive, '--directory', outdir])
    return cmdlist


def _read_gzip_info(gzipfile: gzip.GzipFile) -> tuple:
    """Read the metadata from a gz file

    The underlying file object is read directly; its position is restored
    before returning, also when an error is raised.

    Args:
        gzipfile: Open gzip file, e.g. as returned by ``gzip.open(path, 'rb')``

    Returns:
        tuple(filename, size)
        filename is the name stored in the gzip header. If the header has no name,
        it is ``gzipfile.name`` without a trailing ``.gz``.
        size is the 32-bit value stored in the last four bytes of the file

    Raises:
        IOError: If the file does not start with the gzip magic number
    """
    gf = gzipfile.fileobj
    pos = gf.tell()

    try:
        # Read archive size (little-endian uint32 trailer)
        gf.seek(-4, 2)
        size = struct.unpack('<I', gf.read(4))[0]

        gf.seek(0)
        magic = gf.read(2)
        if magic != b'\037\213':
            raise IOError('Not a gzipped file')

        # method, flags, mtime; the last two header bytes are skipped
        _, flag, _ = struct.unpack("<BBIxx", gf.read(8))

        if not flag & gzip.FNAME:
            # Not stored in the header, use the filename sans .gz
            fname = gzipfile.name
            if fname.endswith('.gz'):
                fname = fname[:-3]
            return fname, size

        if flag & gzip.FEXTRA:
            # Read & discard the extra field, if present
            # (unpack() returns a tuple, the length is its only element)
            (extra_len,) = struct.unpack("<H", gf.read(2))
            gf.read(extra_len)

        # Read a null-terminated string containing the filename
        fname_bytes = bytearray()
        while True:
            s = gf.read(1)
            if not s or s == b'\000':
                break
            fname_bytes.extend(s)
    finally:
        gf.seek(pos)

    try:
        fname = fname_bytes.decode('utf-8')
    except UnicodeDecodeError:
        # The gzip format specifies ISO 8859-1 (Latin-1) for the stored name
        fname = fname_bytes.decode('latin-1')

    return fname, size
Important: We use cookies only for functional and traffic analytics.
We DO NOT use cookies for any marketing purposes. By using our site you acknowledge you have read and understood our Cookie Policy.