Source code for fileorganize.utils

# Utility functions.
# `__all__` lists the public names exported by `from fileorganize.utils import *`.
__all__=['today_YYYYMMDD', 'timestamp_now', 'cp_backup', 'dir_to_df']

import os
import pandas as pd
import numpy as np
import re
import fnmatch
import shutil
from datetime import datetime
import dateutil

def _named_group_values(match):
    '''Return the named-capture values of `match`, replacing unmatched
    optional groups (None) with the empty string.'''
    return ['' if s is None else s for s in match.groupdict().values()]


def dir_to_df(dirname, fnpat=None, dirpat=None, addcols=None, sentinel='',
              to_datetime=True, dotfiles=False, dotdirs=False,
              sort_by=None, **kwargs):
    r'''`dir_to_df`: Recursively generate the filenames in a directory tree
    using `os.walk` and store as rows of a DataFrame.

    'Hidden' files and directories (those with names that start with '.')
    are ignored by default. `dir_to_df` will also not descend into a
    directory tree that contains a sentinel file. Additional parameters can
    be used to filter which filepaths to include in the output, and also to
    add additional file metadata.

    Parameters
    ----------

    dirname : str
        Top-level directory name for filename search.

    fnpat : str, re, default=None
        Regular expression pattern that defines the filenames to return. The
        only filenames in the result set will be those that return a match
        for `re.search(fnpat, filename)`.

        .. note::

            If you use named captures in `fnpat`, new columns corresponding
            to the capture groups will be added to the output dataframe as
            dtype 'Categorical'.

        If you need to use a flag with your pattern, you can use a
        precompiled regex for the value of `fnpat`. For example, you can do
        case-insensitive matching with `re.compile(pattern, re.IGNORECASE)`.

    dirpat : str, re, default=None
        Same as `fnpat`, only applied against the relative path in dirname.
        Relative paths that do not match `dirpat` will be skipped.

    addcols : str, list of str, default=None
        One or more additional columns to include in the output. None (the
        default) and `[]` both mean 'no additional columns'. The list
        passed by the caller is never modified. Possible names and values
        provided are:

        .. table:: Columns that can be added
            :widths: auto

            ==========  ======================================
            Name        value
            ==========  ======================================
            'dirname'   the user-provided top-level directory
            'barename'  the filename without path or extension
            'ext'       the filename extension
            'mtime'     the last modification time of the file
            'bytes'     the size of the file in bytes
            ==========  ======================================

        The 'mtime' column is cast to Pandas Timestamps automatically unless
        `to_datetime` is False. Resolution of the time-based stats is
        dependent on your platform; see the `os.stat` documentation.

    sentinel : str, default=''
        Name of the sentinel file, which marks a directory tree to be
        ignored. No filenames from the directory containing the sentinel
        file will be included in the output, nor will any filenames from any
        of its subdirectories. If the value of `sentinel` is '' or None,
        the sentinel file check will not be performed.

    to_datetime : boolean, default=True
        If True, 'mtime' stats will be converted from Unix epoch to
        datetime. If False, the values will not be converted.

    dotfiles : boolean, default=False
        If True, include filenames beginning with `.` in the output.
        Otherwise, omit these names.

    dotdirs : boolean, default=False
        If True, descend into directories with names that begin with `.`.
        If False, do not descend into these directories.

    sort_by : list of str, default=None
        Sort output dataframe rows by the columns named in the list. None
        (the default) sorts by `['relpath', 'fname']`. Specify an empty
        list `[]` if no sorting is desired.

    kwargs : various
        Remaining kwargs are passed to `os.walk`. If not used, then
        `os.walk` will be called with default kwargs. Note that using
        `os.walk(topdown=False)` is not compatible with `dotdirs=False`.

    Returns
    -------

    fnamedf : DataFrame
        Dataframe of filename rows.

    Raises
    ------

    RuntimeError
        If a named capture group in `fnpat` or `dirpat` has the same name
        as another output column, or if `topdown=False` is combined with
        `dotdirs=False`.

    ValueError
        If `addcols` contains a column name that is not recognized.

    Example
    -------

    In this example we list the .wav files in the Dimex corpus of Mexican
    Spanish. A typical file name is `s09101.wav`, which indicates that the
    file is of speaker `s091` reading sentence `01`. To capture this
    information the **fnpat** variable has two named captures. The first
    ``(?P<subj>s\d\d\d)`` says to parse the filename and find a sequence of
    's' followed by three digits, keeping that sequence in the variable
    `subj`. The second named capture ``(?P<sentence>\d+)`` says to keep the
    remaining one or more digits in the variable `sentence`. Note the final
    two columns in the dataframe.

    .. code-block:: Python

        path_to_corpus = Path('./dimex')

        corpus_list = dir_to_df(path_to_corpus,
                        fnpat = r'(?P<subj>s\d\d\d)(?P<sentence>\d+)\.wav',
                        addcols = ["dirname", "barename"])
        corpus_list.head()

    .. figure:: images/dir_to_df.png
        :scale: 50 %
        :alt: The first few lines of the dataframe corpus_list, which was created by the above code.
        :align: center

        The first few lines of the dataframe corpus_list, which was created
        by the above code.
    '''
    # Normalize addcols into a private list so that the caller's object is
    # never mutated and a single string is accepted.
    if addcols is None:
        addcols = []
    elif isinstance(addcols, str):
        addcols = [addcols]
    else:
        addcols = list(addcols)

    # 'dirname', when requested, is always the first output column.
    if 'dirname' in addcols:
        firstcols = ['dirname', 'relpath', 'fname']
        addcols = [c for c in addcols if c != 'dirname']
    else:
        firstcols = ['relpath', 'fname']
    outcols = firstcols + addcols

    # Attributes of the `os.stat` result that back the stat-based columns.
    stat_attrs = {'bytes': 'st_size', 'mtime': 'st_mtime'}
    known = {'dirname', 'relpath', 'fname', 'barename', 'ext', *stat_attrs}
    unknown = [c for c in addcols if c not in known]
    if unknown:
        raise ValueError(f'Unrecognized addcols value(s): {unknown}')

    # Compile the patterns once, outside of the directory walk.
    # `re.compile` returns an already-compiled pattern unchanged.
    if dirpat is not None:
        dirpat = re.compile(dirpat)
    if fnpat is not None:
        fnpat = re.compile(fnpat)

    mdcols = []   # Names of additional metadata columns from named captures.
    for pat in (dirpat, fnpat):
        if pat is None:
            continue
        for c in pat.groupindex:
            if c in outcols or c in mdcols:
                msg = f'Named group {c} masks another output column.'
                raise RuntimeError(msg)
            mdcols.append(c)

    # Pruning of '.' directories only works when walking top-down.
    if not dotdirs and not kwargs.get('topdown', True):
        msg = '`topdown=False` not compatible with `dotdirs=False`'
        raise RuntimeError(msg)

    need_stat = any(c in stat_attrs for c in addcols)
    need_split = ('barename' in addcols) or ('ext' in addcols)

    recs = []
    for root, dirs, files in os.walk(dirname, **kwargs):
        if sentinel and sentinel in files:
            dirs[:] = []  # Do not descend into subdirectories.
            continue      # Do not include files in this directory.

        # Change dirs in-place to prevent os.walk() from descending into
        # '.' directories. This must happen before any `continue` below so
        # that hidden directories are pruned even when this directory is
        # itself rejected by `dirpat`.
        if not dotdirs:
            dirs[:] = [d for d in dirs if not d.startswith('.')]

        relpath = os.path.relpath(root, dirname)
        dircols = []
        if dirpat is not None:
            dirm = dirpat.search(relpath)
            if dirm is None:
                continue  # Do not include files in this directory.
            dircols = _named_group_values(dirm)

        for name in files:
            if not dotfiles and name.startswith('.'):
                continue
            fncols = []
            if fnpat is not None:
                fnm = fnpat.search(name)
                if fnm is None:
                    continue  # Do not include this file.
                fncols = _named_group_values(fnm)

            values = {'dirname': dirname, 'relpath': relpath, 'fname': name}
            if need_split:
                values['barename'], values['ext'] = os.path.splitext(name)
            if need_stat:
                st = os.stat(os.path.join(root, name))
                for col, attr in stat_attrs.items():
                    values[col] = getattr(st, attr)
            recs.append([values[c] for c in outcols] + dircols + fncols)

    df = pd.DataFrame(recs, columns=outcols + mdcols)

    if sort_by is None:
        sort_by = ['relpath', 'fname']
    elif isinstance(sort_by, str):
        sort_by = [sort_by]
    else:
        sort_by = list(sort_by)
    if sort_by:
        df = df.sort_values(by=sort_by).reset_index(drop=True)

    if len(df) > 0:
        # Cast path-like and named capture columns to Categorical.
        catcols = {'dirname', 'relpath', 'fname', 'barename', 'ext', *mdcols}
        df = df.astype({c: 'category' for c in df.columns if c in catcols})
        if to_datetime and 'mtime' in df.columns:
            # Replace the whole column; assigning through `.loc[:, ...]`
            # tries to keep the existing float dtype.
            df['mtime'] = pd.to_datetime(df['mtime'], unit='s')
    return df
def today_YYYYMMDD():
    """
    Return today's date in YYYYMMDD format.

    The date is taken from the current local time.

    Returns
    =======

    today : str
        Eight-digit date string, e.g. '20240131'.
    """
    # Format the date directly rather than building a full timestamp with a
    # UTC offset and then slicing the date back out of the string.
    return datetime.now().strftime('%Y%m%d')
def timestamp_now():
    """
    Create a timestamp for an acquisition, using current local time.

    Returns
    =======

    timestamp, utcoffset : tuple(str, str)
        A tuple of strings representing the datetime in YYYY-MM-DDTHHMMSS
        format and the timezone offset from UTC, e.g. '-0700'.
    """
    # `astimezone()` with no argument attaches the system local timezone.
    now = datetime.now().astimezone().replace(microsecond=0)
    # Format both parts directly instead of parsing the offset back out of
    # an ISO string; this works for every UTC offset, including +10xx,
    # +13xx and +14xx.
    timestamp = now.strftime('%Y-%m-%dT%H%M%S')
    utcoffset = now.strftime('%z')
    return (timestamp, utcoffset)
def cp_backup(fname, bkdir=None, hidden=True):
    """Make a backup copy of the file `fname` and return the name of the
    copied file.

    By default the copy will have the same name as `fname` with '.'
    prepended and a suffix of the form '.N', where N is an integer.
    Multiple calls to this function result in increasing values of N,
    starting with '1'.

    Parameters
    ==========

    fname : str
        The name of the file to be copied.

    bkdir : str (default: None)
        By default, the backup file will be written in the same directory as
        the source file. If `bkdir` is provided, the backup file will be
        written to that path instead. A `FileNotFoundError` will be thrown
        if the backup directory does not exist.

    hidden : bool (default: True)
        If True, prepend '.' to the backup filename, resulting in a
        'hidden' file. If not True, do not prepend anything.

    Returns
    =======

    dst : str
        The name of the copied backup file.
    """
    if bkdir is None:
        # A bare filename has an empty dirname; use the current directory.
        bkdir = os.path.dirname(fname) or '.'
    basename = os.path.basename(fname)
    cpname = '.' + basename if hidden is True else basename

    # Match existing backups literally. `re.escape` keeps characters such
    # as '[', '*' and '?' in the filename from being read as wildcards.
    rgx = re.compile(re.escape(cpname) + r'\.([0-9]+)')
    matches = (rgx.fullmatch(f) for f in os.listdir(bkdir))
    maxN = max((int(m.group(1)) for m in matches if m), default=0)

    dst = os.path.join(bkdir, f'{cpname}.{maxN + 1:d}')
    shutil.copyfile(fname, dst)
    return dst