# Utility functions.
__all__=['today_YYYYMMDD', 'timestamp_now', 'cp_backup', 'dir_to_df']
import os
import pandas as pd
import numpy as np
import re
import fnmatch
import shutil
from datetime import datetime
import dateutil
[docs]
def dir_to_df(dirname, fnpat=None, dirpat=None, addcols=[], sentinel='',
to_datetime=True, dotfiles=False, dotdirs=False, sort_by=['relpath', 'fname'], **kwargs):
'''`dir_to_df`: Recursively generate the filenames in a directory tree
using `os.walk` and store as rows of a DataFrame.
'Hidden' files and directories (those with names that start with '.') are
ignored by default. `dir_to_df` will also not descend into a directory tree that
contains a sentinel file.
Additional parameters can be used to filter which filepaths to include in the
output, and also to add additional file metadata.
Parameters
----------
dirname : str
Top-level directory name for filename search.
fnpat : str, re, default=None
Regular expression pattern that defines the filenames to return.
The only filenames in the result set will be those that return a match
for `re.search(fnpat, filename)`.
.. note::
If you use named captures in `fnpat`, new columns corresponding to the
capture groups will be added to the output dataframe as dtype 'Categorical'.
If you need to use a flag with your pattern, you can use a precompiled
regex for the value of `fnpat`. For example, you can do
case-insensitive matching with `re.compile(pattern, re.IGNORECASE)`.
dirpat : str, re, default=None
Same as `fnpat`, only applied against the relative path in dirname.
Relative paths that do not match `dirpat` will be skipped.
addcols : str, list of str, default=[]
One or more additional columns to include in the output. Possible names
and values provided are:
.. table:: Columns that can be added
:widths: auto
========== ======================================
Name value
========== ======================================
'dirname' the user-provided top-level directory
'barename' the filename without path or extension
'ext' the filename extension
'mtime' the last modification time of the file
'bytes' the size of the file in bytes
========== ======================================
The 'mtime' column is cast to Pandas Timestamps automatically unless
`to_datetime` is False. Resolution of the time-based stats is dependent
on your platform; see the `os.stat` documentation.
sentinel : str, default=''
Name of the sentinel file, which marks a directory tree to be ignored. No
filenames from the directory containing the sentinel file will be included
in the output, nor will any filenames from any of its subdirectories.
If the value of `sentinel` is '' or None, the sentinel file check will not
be performed.
to_datetime : boolean, default=True
If True, 'mtime' stats will be converted from Unix epoch to datetime.
If False, the values will not be converted.
dotfiles : boolean, default=False
If True, include filenames beginning with `.` in the output. Otherwise,
omit these names.
dotdirs : boolean, default=False
If True, descend into directories with names that begin with `.`. If
False, do not descend into these directories.
sort_by : list of str, default=['relpath', 'fname']
Sort output dataframe rows by the columns named in the list. Specify an empty
list `[]` if no sorting is desired.
kwargs : various
Remaining kwargs are passed to `os.walk`. If not used, then `os.walk` will
be called with default kwargs. Note that using `os.walk(topdown=False)` is
not compatible with `dotdirs=False`.
Returns
-------
fnamedf : DataFrame
Dataframe of filename rows.
Example
-------
In this example we list the .wav files in the Dimex corpus of Mexican Spanish.
A typical file name is `s09101.wav`, which indicates that the file is of speaker
`s091` reading sentence `01`. To capture this information the **fnpat** variable has two named captures.
The first ``(?P<subj>s\d\d\d)`` says to parse the filename and find a sequence of 's'
followed by three digits, keeping that sequence in the variable `subj`. The second named capture
``(?P<sentence>\d+)`` says to keep the remaining one or more digits in the
variable `sentence`. Note the final two columns in the dataframe.
.. code-block:: Python
path_to_corpus = Path('./dimex')
corpus_list = dir_to_df(path_to_corpus,
fnpat = r'(?P<subj>s\d\d\d)(?P<sentence>\d+)\.wav',
addcols = ["dirname", "barename"])
corpus_list.head()
.. figure:: images/dir_to_df.png
:scale: 50 %
:alt: The first few lines of the dataframe corpus_list, which was created by the above code.
:align: center
The first few lines of the dataframe corpus_list, which was created by the above code.
'''
# Coerce addcols to list if passed as single string.
try:
assert isinstance(addcols, str)
addcols = [addcols]
except AssertionError:
pass # Should be a list already.
if 'dirname' in addcols:
firstcols = ['dirname', 'relpath', 'fname']
addcols[:] = [c for c in addcols if c != 'dirname']
else:
firstcols = ['relpath', 'fname']
mdcols = [] # Names of additional metadata columns from named captures.
for pat in (dirpat, fnpat):
if pat is None:
continue
pat = re.compile(pat)
newmdcols = list(pat.groupindex.keys())
for c in newmdcols:
try:
assert(c not in firstcols + addcols + mdcols)
except AssertionError:
msg = f'Named group {c} masks another output column.'
raise RuntimeError(msg)
mdcols += newmdcols
stats = {'bytes': 'st_size'} if 'bytes' in addcols else {}
if 'mtime' in addcols:
stats['mtime'] = 'st_mtime'
if dotdirs is False:
try:
assert(kwargs['topdown'] is True)
except AssertionError:
msg = '`topdown=False` not compatible with `dotdirs=False`'
raise RuntimeError(msg)
except KeyError:
pass
recs = []
for root, dirs, files in os.walk(dirname, **kwargs):
if sentinel in files:
dirs[:] = [] # Do not descend into subdirectories.
continue # Do not include files in this directory.
relpath = os.path.relpath(root, dirname)
dircols = []
if dirpat is not None:
# TODO: don't call compile in this loop
dirpat = re.compile(dirpat)
dirm = dirpat.search(relpath)
if dirm is None:
continue # Do not include files in this directory.
# Add named capture groups and replace unmatched optional
# named captures with empty string.
dircols = [
'' if s is None else s for s in dirm.groupdict().values()
]
for name in files:
if (dotfiles is False) and (name[0] == '.'):
continue
fncols = []
if fnpat is not None:
# TODO: don't call compile in this loop
fnpat = re.compile(fnpat)
fnm = fnpat.search(name)
if fnm is None:
continue # Do not include this file.
# Add named capture groups and replace unmatched optional
# named captures with empty string.
fncols = [
'' if s is None else s for s in fnm.groupdict().values()
]
reccols = []
if stats != {}:
st = os.stat(os.path.join(root, name))
if ('barename' in addcols) or ('ext' in addcols):
(barename, ext) = os.path.splitext(name)
for col in firstcols + addcols:
if col in ['bytes', 'mtime']:
reccols += [getattr(st, stats[col])]
elif col == 'relpath':
reccols += [relpath]
elif col == 'fname':
reccols += [name]
elif col == 'dirname':
reccols += [dirname]
elif col == 'barename':
reccols += [barename]
elif col == 'ext':
reccols += [ext]
recs.append(reccols + dircols + fncols)
# Change dirs in-place to prevent os.walk() from descending into
# '.' directories.
if dotdirs is False:
dirs[:] = [d for d in dirs if not d[0] == '.']
df = pd.DataFrame(
recs,
columns=firstcols + addcols + mdcols
)
if sort_by != []:
df = df.sort_values(
by=sort_by, axis='rows'
).reset_index(drop=True)
if len(df) > 0:
# Cast named captured columns to Categorical.
catcols = ['dirname', 'relpath', 'fname', 'barename', 'ext'] + mdcols
df = df.astype({c: 'category' for c in df.columns if c in catcols})
if to_datetime is True and 'mtime' in df.columns:
df.loc[:, 'mtime'] = pd.to_datetime(df.loc[:, 'mtime'], unit='s')
return df
[docs]
def today_YYYYMMDD():
"""
Return today's date in YYYYMMDD format.
"""
return timestamp_now()[0].split('T')[0].replace('-', '')
[docs]
def timestamp_now():
"""
Create a timestamp for an acquisition, using current local time.
Returns
=======
timestamp, utcoffset : tuple(str, str)
A tuple of strings representing the datetime in YYYY-MM-DDTHHMMSS format and the timezone offset from UTC, e.g. '-0700'.
"""
# Regex that matches a timezone offset at the end of an acquisition
# directory name.
utcoffsetre = re.compile(
r'(?P<offset>(?P<sign>[-+])(?P<hours>0\d|1[12])(?P<minutes>[012345]\d))'
)
timestamp = datetime.now(dateutil.tz.tzlocal()) \
.replace(microsecond=0) \
.isoformat() \
.replace(":","")
m = utcoffsetre.search(timestamp)
utcoffset = m.group('offset')
timestamp = utcoffsetre.sub('', timestamp)
return (timestamp, utcoffset)
[docs]
def cp_backup(fname, bkdir=None, hidden=True):
"""Make a backup copy of the file `fname` and return the name of the copied file.
By default the copy will have the same name as `fname` with '.' prepended and a suffix of the form '.N', where N is an integer. Multiple calls to this function result in increasing values of N, starting with '1'.
Parameters
==========
fname : str
The name of the file to be copied.
bkdir : str (default: None)
By default, the backup file will be written in the same directory as the source file. If `bkdir` is provided, the backup file will be written to that path instead. A `FileNotFoundError` will be thrown if the backup directory does not exist.
hidden : bool (default: True)
If True, prepend '.' to the backup filename, resulting in a 'hidden' file. If not True, do not prepend anything.
Returns
=======
dst : str
The name of the copied backup file.
"""
if bkdir is None:
bkdir = os.path.dirname(fname) if os.path.dirname(fname) != '' else '.'
basename = os.path.basename(fname)
cpname = '.' + basename if hidden is True else basename
# Get extension integers from backups that already exist and find max N.
rgx = re.compile(
fnmatch.translate(cpname).replace('\\Z', '\\.[0-9]+\\Z')
)
Ns = np.array([
int(os.path.splitext(f)[1].lstrip('.')) \
for f in os.listdir(bkdir) if rgx.fullmatch(f)
], dtype=int)
maxN = 0 if len(Ns) == 0 else Ns.max()
cpname += '.{:d}'.format(maxN + 1)
dst = os.path.join(bkdir, cpname)
shutil.copyfile(fname, dst)
return dst