Source code for filecache.file_cache_path

##########################################################################################
# filecache/file_cache_path.py
##########################################################################################

from __future__ import annotations

import contextlib
import functools
import os
from pathlib import Path
import re
import sys
from typing import (cast,
                    Any,
                    Callable,
                    Generator,
                    Iterator,
                    IO,
                    Optional,
                    TYPE_CHECKING)

if TYPE_CHECKING:  # pragma: no cover
    from .file_cache import FileCache  # Circular import

from .file_cache_types import (StrOrPathOrSeqType,
                               UrlToPathFuncOrSeqType,
                               UrlToUrlFuncOrSeqType)


# Module-level fallback FileCache. It is used when an FCPath is created without
# specifying a particular FileCache and the FCPath is actually used to perform an
# operation that needs that FileCache. It starts out as None (nothing allocated yet).
_DEFAULT_FILECACHE: Optional[FileCache] = None


[docs] class FCPath: """Rewrite of the Python pathlib.Path class that supports URLs and FileCache. This class provides a simpler way to abstract away remote access in a FileCache by emulating the Python pathlib.Path class. At the same time, it can collect common parameters (`anonymous`, `lock_timeout`, `nthreads`) into a single location so that they do not have to be specified on every method call. """ _filecache: Optional["FileCache"] _anonymous: Optional[bool] _lock_timeout: Optional[int] _nthreads: Optional[int] _url_to_url: Optional[UrlToUrlFuncOrSeqType] _url_to_path: Optional[UrlToPathFuncOrSeqType]
[docs] def __init__(self, *paths: str | Path | FCPath | None, filecache: Optional["FileCache"] = None, anonymous: Optional[bool] = None, lock_timeout: Optional[int] = None, nthreads: Optional[int] = None, url_to_url: Optional[UrlToUrlFuncOrSeqType] = None, url_to_path: Optional[UrlToPathFuncOrSeqType] = None, copy_from: Optional[FCPath] = None ): """Initialization for the FCPath class. Parameters: paths: The path(s). These may be absolute or relative paths. They are joined together to form a final path. File operations can only be performed on absolute paths. file_cache: The :class:`FileCache` in which to store files retrieved from this path. If not specified, the default global :class:`FileCache` will be used. anonymous: If True, access cloud resources without specifying credentials. If False, credentials must be initialized in the program's environment. If None, use the default setting for the associated :class:`FileCache` instance. lock_timeout: How long to wait, in seconds, if another process is marked as retrieving the file before raising an exception. 0 means to not wait at all. A negative value means to never time out. None means to use the default value for the associated :class:`FileCache` instance. nthreads: The maximum number of threads to use when doing multiple-file retrieval or upload. If None, use the default value for the associated :class:`FileCache` instance. url_to_url: The function (or list of functions) that is used to translate URLs into URLs. A user-specified translator function takes three arguments:: func(scheme: str, remote: str, path: str) -> str where `scheme` is the URL scheme (like ``"gs"`` or ``"file"``), `remote` is the name of the bucket or webserver or the empty string for a local file, and `path` is the rest of the URL. If the translator wants to override the default translation, it can return a new complete URL as a string. Otherwise, it returns None. 
If more than one translator is specified, they are called in order until one returns a URL, or it falls through to the default. If None, use the default translators for the associated :class:`FileCache` instance. url_to_path: The function (or list of functions) that is used to translate URLs into local paths. By default, :class:`FileCache` uses a directory hierarchy consisting of ``<cache_dir>/<cache_name>/<source>/<path>``, where ``source`` is the URL prefix converted to a filesystem-friendly format (e.g. ``gs://bucket`` is converted to ``gs_bucket``). A user-specified translator function takes five arguments:: func(scheme: str, remote: str, path: str, cache_dir: Path, cache_subdir: str) -> str | Path where `scheme` is the URL scheme (like ``"gs"`` or ``"file"``), `remote` is the name of the bucket or webserver or the empty string for a local file, `path` is the rest of the URL, `cache_dir` is the top-level directory of the cache (``<cache_dir>/<cache_name>``), and `cache_subdir` is the subdirectory specific to this scheme and remote. If the translator wants to override the default translation, it can return a Path. Otherwise, it returns None. If the returned Path is relative, if will be appended to `cache_dir`; if it is absolute, it will be used directly (be very careful with this, as it has the ability to access files outside of the cache directory). If more than one translator is specified, they are called in order until one returns a Path, or it falls through to the default. Note that `url_to_path` operates on the original URL, not the URL generated by a `url_to_url` translator. If None, use the default translators for the associated :class:`FileCache` instance. copy_from: An FCPath instance to copy internal parameters (`file_cache`, `anonymous`, `lock_timeout`, `nthreads`, `url_to_url`, and `url_to_path`) from. If specified, any values for these parameters in this constructor are ignored. Used internally and should not be used by external programmers. 
""" self._path = self._join(*paths) if copy_from is None and len(paths) > 0 and isinstance(paths[0], FCPath): copy_from = paths[0] if copy_from is not None: self._filecache = copy_from._filecache self._anonymous = copy_from._anonymous self._lock_timeout = copy_from._lock_timeout self._nthreads = copy_from._nthreads self._url_to_url = copy_from._url_to_url self._url_to_path = copy_from._url_to_path else: self._filecache = filecache self._anonymous = anonymous self._lock_timeout = lock_timeout if nthreads is not None and (not isinstance(nthreads, int) or nthreads <= 0): raise ValueError(f'nthreads must be a positive integer, got {nthreads}') self._nthreads = nthreads self._url_to_url = url_to_url self._url_to_path = url_to_path self._pathlib: Optional[Path] = None self._upload_counter = 0 self._download_counter = 0
def _validate_nthreads(self, nthreads: Optional[int]) -> int | None:
    """Validate a per-call `nthreads` value and apply this path's default.

    Parameters:
        nthreads: The requested thread count, or None.

    Returns:
        `nthreads` if it was given; otherwise the value stored in this FCPath
        (which may itself be None).

    Raises:
        ValueError: If `nthreads` is not None and is not a positive integer.
    """

    if nthreads is not None and (not isinstance(nthreads, int) or nthreads <= 0):
        raise ValueError(f'nthreads must be a positive integer, got {nthreads}')
    if nthreads is None:
        nthreads = self._nthreads
    return nthreads

@staticmethod
def _split_parts(path: str | Path) -> tuple[str, str, str]:
    """Split a path into drive, root, and remainder of path.

    Backslashes are converted to forward slashes before parsing. The drive is a
    Windows drive letter (upper-cased), a UNC ``//host/share``, or a
    ``scheme://remote`` prefix; the root is '/' or ''; the remainder keeps its
    leading '/' when there is one.

    Raises:
        ValueError: If a UNC path or URI is missing its host, share, or remote
            name.
    """

    from .file_cache import FileCache  # Circular import avoidance

    path = str(path).replace('\\', '/')
    drive = ''
    root = ''
    if len(path) >= 2 and path[0].isalpha() and path[1] == ':':
        # Windows C:
        drive = path[0:2].upper()
        path = path[2:]

    elif path.startswith('//'):
        # UNC //host/share
        path2 = path[2:]  # Everything after the leading '//'

        try:
            idx = path2.index('/')  # End of the hostname
        except ValueError:
            raise ValueError(f'UNC path does not include share name {path!r}')
        if idx == 0:
            raise ValueError(f'UNC path does not include hostname {path!r}')

        try:
            idx2 = path2[idx+1:].index('/')  # End of the share name
        except ValueError:
            # It's just a share name like //host/share
            drive = path
            path = ''
        else:
            # It's a share plus path like //host/share/path
            # We include the leading /
            if idx2 == 0:
                raise ValueError(f'UNC path does not include share {path!r}')
            # +3 accounts for the leading '//' and the '/' between host and share
            drive = path[:idx+idx2+3]
            path = path[idx+idx2+3:]

    elif path.startswith(FileCache.registered_scheme_prefixes()):
        # Cloud
        idx = path.index('://')
        path2 = path[idx+3:]  # Everything after '://'
        if path2 == '':
            raise ValueError(f'URI does not include remote name {path!r}')
        try:
            idx2 = path2.index('/')  # End of the remote name
        except ValueError:
            # It's just a remote name like gs://bucket; we still make it absolute
            drive = path
            path = '/'
        else:
            # It's a remote name plus path like gs://bucket/path
            # We include the leading /
            # An empty remote name is only accepted for file:// URLs
            if idx2 == 0 and not path.startswith('file://'):
                raise ValueError(f'URI does not include remote name {path!r}')
            # +3 accounts for the length of '://'
            drive = path[:idx+idx2+3]
            path = path[idx+idx2+3:]

    if path.startswith('/'):
        root = '/'
    # Drop trailing slashes, but never reduce a bare root to the empty string
    if path != root:
        path = path.rstrip('/')

    return drive, root, path

@staticmethod
def _split(path: str) -> tuple[str, str]:
    """Split a path into head,tail similarly to os.path.split."""

    if path == '':
        return '', ''
    drive, root, subpath = FCPath._split_parts(path)
    if '/' not in subpath:
        # Single relative component: the head is just the drive
        return drive, subpath
    if root == '/' and subpath == root:
        # The path is only an anchor; there is no tail
        return drive + '/', ''
    idx = subpath.rindex('/')
    if idx == 0:
        # Single component directly under the root
        return drive + '/', subpath[idx+1:]
    return drive + subpath[:idx].rstrip('/'), subpath[idx+1:]

@staticmethod
def _is_absolute(path: str) -> bool:
    """Check if a path string is an absolute path."""

    return FCPath._split_parts(path)[1] == '/'

@staticmethod
def _join(*paths: str | Path | FCPath | None) -> str:
    """Join multiple strings together into a single path.

    Any time an absolute path is found in the path list, the new path starts
    over. None and empty entries are skipped.

    Raises:
        TypeError: If an entry is not None, a str, a Path, or an FCPath.
    """

    ret = ''
    for path in paths:
        if path is None:
            continue
        if not isinstance(path, (str, Path, FCPath)):
            raise TypeError(f'path {path!r} is not a str, Path, or FCPath')
        path = str(path)
        if not path:
            continue
        drive, root, subpath = FCPath._split_parts(path)
        # Collapse any run of slashes into a single slash
        while '//' in subpath:
            subpath = subpath.replace('//', '/')
        if root == '/':
            # Absolute path - start over
            ret = ''
        if ret == '':
            ret = drive
        elif ret != '' and ret[-1] != '/' and subpath != '' and subpath[0] != '/':
            ret += '/'
        # A bare URL such as gs://bucket is kept without a trailing slash
        if not (subpath == '/' and '://' in drive):
            ret = ret + subpath

    return ret

@staticmethod
def _filename(path: str) -> str:
    """Return just the filename part of a path."""

    _, _, subpath = FCPath._split_parts(path)
    if '/' not in subpath:
        return subpath
    return subpath[subpath.rfind('/') + 1:]

@property
def _stack(self) -> tuple[str, list[str]]:
    """Split the path into a 2-tuple (anchor, parts).

    *anchor* is the uppermost parent of the path (equivalent to
    path.parents[-1]), and *parts* is a reversed list of parts following the
    anchor.
    """

    path = self._path
    parent, name = FCPath._split(path)
    names = []
    # Peel off the final component until splitting no longer changes the path
    while path != parent:
        names.append(name)
        path = parent
        parent, name = FCPath._split(path)
    return path, names
[docs] def __str__(self) -> str: return self._path
@property def path(self) -> str: """Return this path as a string.""" return self._path
[docs] def as_pathlib(self) -> Path: """Return this path as a pathlib Path object.""" if self._pathlib is None: if not self.is_local(): raise ValueError(f'Cannot convert {self} to pathlib.Path') self._pathlib = Path(self._path) return self._pathlib
[docs] def as_posix(self) -> str: """Return this FCPath as a POSIX path. This is a str using only forward slashes. Notes: Because URLs are not really supported in POSIX format, we just return the URL as-is, including any scheme and remote. Returns: This path as a POSIX path. """ return self._path
@property
def drive(self) -> str:
    """The drive associated with this FCPath.

    Notes:
        Examples:

        For a Windows path: '' or 'C:'
        For a UNC share: '//host/share'
        For a cloud resource: 'gs://bucket'
    """

    drive_part, _, _ = self._split_parts(self._path)
    return drive_part

@property
def root(self) -> str:
    """The root of this FCPath; '/' if absolute, '' otherwise."""

    _, root_part, _ = self._split_parts(self._path)
    return root_part

@property
def anchor(self) -> str:
    """The anchor of this FCPath, which is drive + root."""

    drive_part, root_part, _ = self._split_parts(self._path)
    return drive_part + root_part

@property
def suffix(self) -> str:
    """The final component's last suffix, if any, including the leading period."""

    filename = FCPath._filename(self._path)
    dot = filename.rfind('.')
    # A dot at the very start or very end of the name does not begin a suffix
    return filename[dot:] if 0 < dot < len(filename) - 1 else ''

@property
def suffixes(self) -> list[str]:
    """A list of the final component's suffixes, including the leading periods."""

    filename = FCPath._filename(self._path)
    if filename.endswith('.'):
        return []
    # Leading dots belong to the stem; everything after the first piece is a suffix
    _, *extensions = filename.lstrip('.').split('.')
    return ['.' + extension for extension in extensions]

@property
def stem(self) -> str:
    """The final path component, minus its last suffix."""

    filename = FCPath._filename(self._path)
    dot = filename.rfind('.')
    return filename[:dot] if 0 < dot < len(filename) - 1 else filename
def with_name(self, name: str) -> FCPath:
    """Return a new FCPath with the filename changed.

    Parameters:
        name: The new filename to replace the final path component with.

    Returns:
        A new FCPath with the final component replaced. The new FCPath will have
        the same parameters (`filecache`, etc.) as the source FCPath.

    Raises:
        ValueError: If `name` is empty or contains a drive, a root, or a
            directory separator.
    """

    drive, _, subpath = FCPath._split_parts(self._path)
    name_drive, name_root, name_subpath = FCPath._split_parts(name)

    is_plain_name = (name_drive == '' and name_root == '' and
                     name_subpath != '' and '/' not in name_subpath)
    if not is_plain_name:
        raise ValueError(f"Invalid name {name!r}")

    # rfind() gives -1 when there is no '/', which makes the directory prefix empty
    directory = subpath[:subpath.rfind('/') + 1]
    return FCPath(drive + directory + name, copy_from=self)
def with_stem(self, stem: str) -> FCPath:
    """Return a new FCPath with the stem (the filename minus the suffix) changed.

    Parameters:
        stem: The new stem.

    Returns:
        A new FCPath with the final component's stem replaced. The new FCPath will
        have the same parameters (`filecache`, etc.) as the source FCPath.

    Raises:
        ValueError: If `stem` is empty while this path has a non-empty suffix.
    """

    current_suffix = self.suffix

    # If the suffix is non-empty, we can't make the stem empty.
    if current_suffix and not stem:
        raise ValueError(f"{self!r} has a non-empty suffix")

    new_name = stem + current_suffix if current_suffix else stem
    return self.with_name(new_name)
def with_suffix(self, suffix: str) -> FCPath:
    """Return a new FCPath with the file suffix changed.

    If the path has no suffix, the given suffix is added. If the given suffix is
    an empty string, the suffix is removed from the path.

    Parameters:
        suffix: The new suffix to use.

    Returns:
        A new FCPath with the final component's suffix replaced. The new FCPath
        will have the same parameters (`filecache`, etc.) as the source FCPath.

    Raises:
        ValueError: If this path has an empty stem, or if `suffix` is non-empty
            and is not a period followed by at least one character.
    """

    current_stem = self.stem

    # If the stem is empty, we can't make the suffix non-empty.
    if not current_stem:
        raise ValueError(f"{self!r} has an empty name")

    if suffix and (not suffix.startswith('.') or len(suffix) <= 1):
        raise ValueError(f"Invalid suffix {suffix!r}")

    return self.with_name(current_stem + suffix)
@property def parts(self) -> tuple[str, ...]: """An object providing sequence-like access to the components in the path.""" anchor, parts = self._stack if anchor: parts.append(anchor) return tuple(reversed(parts))
[docs] def joinpath(self, *pathsegments: str | Path | FCPath | None) -> FCPath: """Combine this path with additional paths. Parameters: pathsegments: One or more additional paths to join with this path. Returns: A new FCPath that is a combination of this path and the additional paths. The new FCPath will have the same parameters (`filecache`, etc.) as the source FCPath. """ return FCPath(self._path, *pathsegments, copy_from=self)
[docs] def __truediv__(self, other: str | Path | FCPath | None) -> FCPath: """Combine this path with an additional path. Parameters: other: The path to join with this path. Returns: A new FCPath that is a combination of this path and the other path. The new FCPath will have the same parameters (`filecache`, etc.) as the current FCPath. """ return FCPath(self._path, other, copy_from=self)
[docs] def __rtruediv__(self, other: str | Path | FCPath) -> FCPath: """Combine an additional path with this path. Parameters: other: The path to join with this path. Returns: A new FCPath that is a combination of the other path and this path. The new FCPath will have the same parameters (`filecache`, etc.) as the other path if the other path is an FCPath; otherwise it will have the same parameters as the current FCPath. """ if isinstance(other, FCPath): # pragma: no cover # This shouldn't be possible to hit because __truediv__ will catch it return FCPath(other, self._path, copy_from=other) else: return FCPath(other, self._path, copy_from=self)
[docs] def splitpath(self, search_dir: str) -> tuple[FCPath, ...]: """Split the path into a list of FCPaths at each occurrence of search_dir. Parameters: search_dir: The directory to search for. Returns: A tuple of FCPaths, each of which is a segment of the path between instances of search_dir, not including the search_dir itself. """ parts = self.parts indices = [i for i, part in enumerate(parts) if part == search_dir] indices = [-1] + indices + [len(parts)] return tuple(FCPath(*parts[i+1:j], copy_from=self) for i, j in zip(indices[:-1], indices[1:]))
[docs] def __repr__(self) -> str: parts = [repr(self._path)] if self._filecache is not None: parts.append(f'filecache={self._filecache!r}') if self._anonymous is not None: parts.append(f'anonymous={self._anonymous!r}') if self._lock_timeout is not None: parts.append(f'lock_timeout={self._lock_timeout!r}') if self._nthreads is not None: parts.append(f'nthreads={self._nthreads!r}') if self._url_to_url is not None: parts.append(f'url_to_url={self._url_to_url!r}') if self._url_to_path is not None: parts.append(f'url_to_path={self._url_to_path!r}') return f'FCPath({", ".join(parts)})'
[docs] def __eq__(self, other: object) -> bool: if not isinstance(other, FCPath): return NotImplemented return self._path == other._path
[docs] def __lt__(self, other: object) -> bool: if not isinstance(other, FCPath): return NotImplemented return self._path < other._path
[docs] def __le__(self, other: object) -> bool: if not isinstance(other, FCPath): return NotImplemented return self._path <= other._path
[docs] def __gt__(self, other: object) -> bool: if not isinstance(other, FCPath): return NotImplemented return self._path > other._path
[docs] def __ge__(self, other: object) -> bool: if not isinstance(other, FCPath): return NotImplemented return self._path >= other._path
@property def name(self) -> str: """The final component of the path.""" return FCPath._split(self._path)[1] @property def parent(self) -> FCPath: """The logical parent of the path. The new FCPath will have the same parameters (`filecache`, etc.) as the original path. """ parent = FCPath._split(self._path)[0] if self._path != parent: return FCPath(parent, copy_from=self) return self @property def parents(self) -> tuple[FCPath, ...]: """A sequence of this path's logical parents.""" path = self._path parent = FCPath._split(path)[0] parents = [] while path != parent: parents.append(FCPath(parent, copy_from=self)) path = parent parent = FCPath._split(path)[0] return tuple(parents)
def is_absolute(self) -> bool:
    """True if the path is absolute."""

    _, root_part, _ = FCPath._split_parts(self._path)
    return root_part == '/'
def as_absolute(self) -> FCPath:
    """Return the absolute version of this possibly-relative path.

    An already-absolute path is returned unchanged (the same object). Otherwise
    the path is converted through pathlib: the user directory is expanded and the
    result is made absolute and resolved. The new FCPath will have the same
    parameters (`filecache`, etc.) as this one.
    """

    if FCPath._is_absolute(self._path):
        return self

    local_path = self.as_pathlib()
    resolved = local_path.expanduser().absolute().resolve()
    return FCPath(resolved, copy_from=self)
def match(self, path_pattern: str | Path | FCPath) -> bool:
    """Return True if this path matches the given pattern.

    If the pattern is relative, matching is done from the right; otherwise, the
    entire path is matched. The recursive wildcard ``**`` is *not* supported by
    this method (it just acts like ``*``). See pathlib.Path.match for full
    documentation.

    Raises:
        ValueError: If the pattern has no components.
    """

    pattern = (path_pattern if isinstance(path_pattern, FCPath)
               else FCPath(path_pattern))

    # Compare from the last component backwards
    own_parts = tuple(reversed(self.parts))
    wanted_parts = tuple(reversed(pattern.parts))

    if not wanted_parts:
        raise ValueError('empty pattern')
    if len(own_parts) < len(wanted_parts):
        return False
    # An anchored pattern has to account for every component of this path
    if len(own_parts) > len(wanted_parts) and pattern.anchor:
        return False

    globber = _StringGlobber(self)
    return all(globber.compile(wanted)(own) is not None
               for own, wanted in zip(own_parts, wanted_parts))
def full_match(self, pattern: str | Path | FCPath) -> bool:
    """Return True if this path matches the given glob-style pattern.

    The pattern is matched against the entire path. See pathlib.Path.full_match
    for full documentation.
    """

    # Route the pattern through FCPath so it is joined the same way as this path
    pattern_path = pattern if isinstance(pattern, FCPath) else FCPath(pattern)
    globber = _StringGlobber(self, recursive=True)
    matcher = globber.compile(str(pattern_path))
    result = matcher(self._path)
    return result is not None
@property
def filecache(self) -> "FileCache":
    """The FileCache used by this path.

    If no FileCache was attached to this path, the module-wide default is
    returned; that default is created on first use.
    """
    # Imported here rather than at module level because of the circular import.
    from .file_cache import FileCache

    global _DEFAULT_FILECACHE

    if self._filecache is not None:
        return self._filecache
    if _DEFAULT_FILECACHE is None:
        _DEFAULT_FILECACHE = FileCache()
    return _DEFAULT_FILECACHE

def _make_paths_absolute(self,
                         sub_path: Optional[StrOrPathOrSeqType]) -> str | list[str]:
    """Join `sub_path` onto this path and make any non-absolute result absolute.

    Parameters:
        sub_path: A single sub-path, or a list or tuple of sub-paths, to be
            joined onto this path.

    Returns:
        The joined path as a string, or a list of such strings when `sub_path`
        is a list or tuple. A joined path that is not absolute is treated as a
        local path: it is expanded, made absolute, and resolved through
        pathlib and returned in POSIX form.
    """
    def _absolutize(one_sub_path: Any) -> str:
        joined = FCPath._join(self._path, one_sub_path)
        if FCPath._is_absolute(joined):
            return joined
        resolved = Path(joined).expanduser().absolute().resolve()
        return FCPath(resolved).as_posix()

    if isinstance(sub_path, (list, tuple)):
        return [_absolutize(entry) for entry in sub_path]
    return _absolutize(sub_path)
def get_local_path(self,
                   sub_path: Optional[StrOrPathOrSeqType] = None,
                   *,
                   create_parents: bool = True,
                   url_to_url: Optional[UrlToUrlFuncOrSeqType] = None,
                   url_to_path: Optional[UrlToPathFuncOrSeqType] = None,
                   ) -> Path | list[Path]:
    """Map this path, or sub-paths of it, to the corresponding local path(s).

    Parameters:
        sub_path: The path of the file relative to this path; when omitted,
            this path itself is used. If `sub_path` is a list or tuple, every
            entry is processed. A derived path that is not absolute is treated
            as a relative local path and made absolute by expanding usernames
            and resolving links.
        create_parents: If True, create all parent directories. This is
            useful when the local path is about to be written and uploaded.
        url_to_url: A translator function, or a sequence of them, that maps a
            URL to a different URL. Each translator is called as::

                func(scheme: str, remote: str, path: str) -> str

            where `scheme` is the URL scheme (such as ``"gs"`` or
            ``"file"``), `remote` is the bucket or webserver name (the empty
            string for a local file), and `path` is the remainder of the URL.
            A translator returns a complete replacement URL to override the
            default translation, or None to decline. Translators are tried in
            order until one returns a URL; otherwise the default translation
            applies. If None, the `url_to_url` value stored on this
            :class:`FCPath` is passed along instead.
        url_to_path: A translator function, or a sequence of them, that maps a
            URL to a local path. Without translators, :class:`FileCache` lays
            files out as ``<cache_dir>/<cache_name>/<source>/<path>``, where
            ``source`` is the URL prefix in a filesystem-friendly form (for
            example ``gs://bucket`` becomes ``gs_bucket``). Each translator is
            called as::

                func(scheme: str, remote: str, path: str, cache_dir: Path,
                     cache_subdir: str) -> str | Path

            where `scheme`, `remote`, and `path` are as for `url_to_url`,
            `cache_dir` is the top-level directory of the cache
            (``<cache_dir>/<cache_name>``), and `cache_subdir` is the
            subdirectory specific to this scheme and remote. A translator
            returns a Path to override the default, or None to decline. A
            relative Path is appended to `cache_dir`; an absolute Path is used
            as-is, so take care: it can reach files outside the cache
            directory. Translators are tried in order until one returns a
            Path; otherwise the default layout applies. Translators see the
            original URL, not the output of a `url_to_url` translator. If
            None, the `url_to_path` value stored on this :class:`FCPath` is
            passed along instead.

    Returns:
        The Path (or list of Paths) of the URL (possibly as mapped by the
        `url_to_url` translators) in the cache directory, or as chosen by the
        `url_to_path` translators. The files need not exist, since a Path may
        be wanted for writing a file that will be uploaded. To support that,
        when `create_parents` is True the complete parent directory structure
        is created for each returned Path as a side effect.
    """
    targets = self._make_paths_absolute(sub_path)
    effective_url_to_url = url_to_url or self._url_to_url
    effective_url_to_path = url_to_path or self._url_to_path

    cache = self.filecache
    return cache.get_local_path(cast(StrOrPathOrSeqType, targets),
                                anonymous=self._anonymous,
                                create_parents=create_parents,
                                url_to_url=effective_url_to_url,
                                url_to_path=effective_url_to_path)
def exists(self,
           sub_path: Optional[StrOrPathOrSeqType] = None,
           *,
           bypass_cache: bool = False,
           nthreads: Optional[int] = None,
           url_to_url: Optional[UrlToUrlFuncOrSeqType] = None,
           url_to_path: Optional[UrlToPathFuncOrSeqType] = None
           ) -> bool | list[bool]:
    """Check whether a file exists, without downloading it.

    Parameters:
        sub_path: The path of the file relative to this path; when omitted,
            this path itself is used. A derived path that is not absolute is
            treated as a relative local path and made absolute by expanding
            usernames and resolving links.
        bypass_cache: If False, look for the file in the local cache first and
            only then on the remote server. If True, check only the remote
            server.
        nthreads: The maximum number of threads to use when doing
            multiple-file retrieval or upload. If None, use the default value
            given when this :class:`FCPath` was created.
        url_to_url: A translator function, or a sequence of them, that maps a
            URL to a different URL. Each translator is called as::

                func(scheme: str, remote: str, path: str) -> str

            where `scheme` is the URL scheme (such as ``"gs"`` or
            ``"file"``), `remote` is the bucket or webserver name (the empty
            string for a local file), and `path` is the remainder of the URL.
            A translator returns a complete replacement URL to override the
            default translation, or None to decline. Translators are tried in
            order until one returns a URL; otherwise the default translation
            applies. If None, the `url_to_url` value stored on this
            :class:`FCPath` is passed along instead.
        url_to_path: A translator function, or a sequence of them, that maps a
            URL to a local path. Without translators, :class:`FileCache` lays
            files out as ``<cache_dir>/<cache_name>/<source>/<path>``, where
            ``source`` is the URL prefix in a filesystem-friendly form (for
            example ``gs://bucket`` becomes ``gs_bucket``). Each translator is
            called as::

                func(scheme: str, remote: str, path: str, cache_dir: Path,
                     cache_subdir: str) -> str | Path

            where `scheme`, `remote`, and `path` are as for `url_to_url`,
            `cache_dir` is the top-level directory of the cache
            (``<cache_dir>/<cache_name>``), and `cache_subdir` is the
            subdirectory specific to this scheme and remote. A translator
            returns a Path to override the default, or None to decline. A
            relative Path is appended to `cache_dir`; an absolute Path is used
            as-is, so take care: it can reach files outside the cache
            directory. Translators are tried in order until one returns a
            Path; otherwise the default layout applies. Translators see the
            original URL, not the output of a `url_to_url` translator. If
            None, the `url_to_path` value stored on this :class:`FCPath` is
            passed along instead.

    Returns:
        True if the file exists. Note that a file may exist and still not be
        downloadable because of permissions. False if the file does not exist;
        this includes bad bucket or webserver names, lack of permission to
        examine a bucket's contents, etc.
    """
    thread_count = self._validate_nthreads(nthreads)
    targets = self._make_paths_absolute(sub_path)
    effective_url_to_url = url_to_url or self._url_to_url
    effective_url_to_path = url_to_path or self._url_to_path

    return self.filecache.exists(cast(StrOrPathOrSeqType, targets),
                                 bypass_cache=bypass_cache,
                                 nthreads=thread_count,
                                 anonymous=self._anonymous,
                                 url_to_url=effective_url_to_url,
                                 url_to_path=effective_url_to_path)
def modification_time(self,
                      sub_path: Optional[StrOrPathOrSeqType] = None,
                      *,
                      bypass_cache: bool = False,
                      nthreads: Optional[int] = None,
                      exception_on_fail: bool = True,
                      url_to_url: Optional[UrlToUrlFuncOrSeqType] = None
                      ) -> float | None | Exception | list[float | None | Exception]:
    """Return the modification time of a remote file as a Unix timestamp.

    Parameters:
        sub_path: The path of the file relative to this path; when omitted,
            this path itself is used. If `sub_path` is a list or tuple, all
            URLs are checked, which may be more efficient because files can be
            checked in parallel. A derived path that is not absolute is
            treated as a relative local path and made absolute by expanding
            usernames and resolving links.
        bypass_cache: If False, take the modification time from the metadata
            cache first, if enabled, and only then from the remote server. If
            True, query only the remote server.
        nthreads: The maximum number of threads to use. If None, use the
            default value given when this :class:`FCPath` was created.
        exception_on_fail: If True, a FileNotFound exception is raised when
            any file does not exist. If False, the function returns normally
            and each failed check is reported as the Exception that caused the
            failure, in place of the modification time.
        url_to_url: A translator function, or a sequence of them, that maps a
            URL to a different URL. Each translator is called as::

                func(scheme: str, remote: str, path: str) -> str

            where `scheme` is the URL scheme (such as ``"gs"`` or
            ``"file"``), `remote` is the bucket or webserver name (the empty
            string for a local file), and `path` is the remainder of the URL.
            A translator returns a complete replacement URL to override the
            default translation, or None to decline. Translators are tried in
            order until one returns a URL; otherwise the default translation
            applies. If None, the `url_to_url` value stored on this
            :class:`FCPath` is passed along instead.

    Returns:
        The modification time as a Unix timestamp if the file exists and the
        time can be retrieved, None otherwise. If `sub_path` was a list or
        tuple, a list of modification times in the same order. The time is
        always that of the file on the remote source, even when a local copy
        exists; for the local copy's time, call the normal ``stat`` function.
        If `exception_on_fail` is False, any entry may be an Exception if that
        file does not exist or its modification time cannot be retrieved.

    Raises:
        FileNotFoundError: If a file does not exist.
    """
    thread_count = self._validate_nthreads(nthreads)
    targets = self._make_paths_absolute(sub_path)
    effective_url_to_url = url_to_url or self._url_to_url

    cache = self.filecache
    return cache.modification_time(cast(StrOrPathOrSeqType, targets),
                                   anonymous=self._anonymous,
                                   bypass_cache=bypass_cache,
                                   nthreads=thread_count,
                                   exception_on_fail=exception_on_fail,
                                   url_to_url=effective_url_to_url)
def is_dir(self,
           sub_path: Optional[StrOrPathOrSeqType] = None,
           *,
           nthreads: Optional[int] = None,
           exception_on_fail: bool = True,
           url_to_url: Optional[UrlToUrlFuncOrSeqType] = None
           ) -> bool | Exception | list[bool | Exception]:
    """Check whether a path represents a directory.

    Parameters:
        sub_path: The path of the directory relative to this path; when
            omitted, this path itself is used. If `sub_path` is a list or
            tuple, all paths are checked. A derived path that is not absolute
            is treated as a relative local path and made absolute by expanding
            usernames and resolving links.
        nthreads: The maximum number of threads to use for multiple paths.
        exception_on_fail: If True, a FileNotFound exception is raised when
            any path cannot be checked. If False, the function returns
            normally and each failed check is reported as the Exception that
            caused the failure.
        url_to_url: A translator function, or a sequence of them, that maps a
            URL to a different URL. Each translator is called as::

                func(scheme: str, remote: str, path: str) -> str

            where `scheme` is the URL scheme (such as ``"gs"`` or
            ``"file"``), `remote` is the bucket or webserver name (the empty
            string for a local file), and `path` is the remainder of the URL.
            A translator returns a complete replacement URL to override the
            default translation, or None to decline. Translators are tried in
            order until one returns a URL; otherwise the default translation
            applies. If None, the `url_to_url` value stored on this
            :class:`FCPath` is passed along instead.

    Returns:
        True if the path represents a directory, False otherwise. If
        `sub_path` was a list or tuple, a list of booleans or exceptions in
        the same order. If `exception_on_fail` is False, any result may be an
        Exception if that path cannot be checked.

    Raises:
        FileNotFoundError: If a path cannot be checked.

    Notes:
        Unlike ``os.path.isdir`` or ``pathlib.Path.is_dir``, this method
        raises an exception if the URL does not exist instead of returning
        ``False``, so that remote connection errors are not masked by the
        return value.
    """
    thread_count = self._validate_nthreads(nthreads)
    targets = self._make_paths_absolute(sub_path)
    effective_url_to_url = url_to_url or self._url_to_url

    cache = self.filecache
    return cache.is_dir(cast(StrOrPathOrSeqType, targets),
                        anonymous=self._anonymous,
                        nthreads=thread_count,
                        exception_on_fail=exception_on_fail,
                        url_to_url=effective_url_to_url)
def retrieve(self,
             sub_path: Optional[StrOrPathOrSeqType] = None,
             *,
             lock_timeout: Optional[int] = None,
             nthreads: Optional[int] = None,
             exception_on_fail: bool = True,
             url_to_url: Optional[UrlToUrlFuncOrSeqType] = None,
             url_to_path: Optional[UrlToPathFuncOrSeqType] = None
             ) -> Path | Exception | list[Path | Exception]:
    """Retrieve one or more files and store them in the file cache.

    Parameters:
        sub_path: The path of the file relative to this path; when omitted,
            this path itself is used. If `sub_path` is a list or tuple, the
            complete list of files is retrieved; depending on the storage
            location this may be more efficient because files can be
            downloaded in parallel. A derived path that is not absolute is
            treated as a relative local path and made absolute by expanding
            usernames and resolving links.
        lock_timeout: How long to wait, in seconds, if another process is
            marked as retrieving the file before raising an exception. 0 means
            do not wait at all. A negative value means never time out. None
            means use the default value given when this :class:`FCPath` was
            created.
        nthreads: The maximum number of threads to use when doing
            multiple-file retrieval or upload. If None, use the default value
            given when this :class:`FCPath` was created.
        exception_on_fail: If True, a FileNotFound exception is raised when
            any file does not exist or a download fails, and a TimeoutError is
            raised when acquiring a lock or waiting for another process to
            download a file fails. If False, the function returns normally and
            each failed download is reported as the Exception that caused the
            failure, in place of the returned Path.
        url_to_url: A translator function, or a sequence of them, that maps a
            URL to a different URL. Each translator is called as::

                func(scheme: str, remote: str, path: str) -> str

            where `scheme` is the URL scheme (such as ``"gs"`` or
            ``"file"``), `remote` is the bucket or webserver name (the empty
            string for a local file), and `path` is the remainder of the URL.
            A translator returns a complete replacement URL to override the
            default translation, or None to decline. Translators are tried in
            order until one returns a URL; otherwise the default translation
            applies. If None, the `url_to_url` value stored on this
            :class:`FCPath` is passed along instead.
        url_to_path: A translator function, or a sequence of them, that maps a
            URL to a local path. Without translators, :class:`FileCache` lays
            files out as ``<cache_dir>/<cache_name>/<source>/<path>``, where
            ``source`` is the URL prefix in a filesystem-friendly form (for
            example ``gs://bucket`` becomes ``gs_bucket``). Each translator is
            called as::

                func(scheme: str, remote: str, path: str, cache_dir: Path,
                     cache_subdir: str) -> str | Path

            where `scheme`, `remote`, and `path` are as for `url_to_url`,
            `cache_dir` is the top-level directory of the cache
            (``<cache_dir>/<cache_name>``), and `cache_subdir` is the
            subdirectory specific to this scheme and remote. A translator
            returns a Path to override the default, or None to decline. A
            relative Path is appended to `cache_dir`; an absolute Path is used
            as-is, so take care: it can reach files outside the cache
            directory. Translators are tried in order until one returns a
            Path; otherwise the default layout applies. Translators see the
            original URL, not the output of a `url_to_url` translator. If
            None, the `url_to_path` value stored on this :class:`FCPath` is
            passed along instead.

    Returns:
        The Path of the filename in the temporary directory (or the original
        absolute path if local). If `sub_path` was a list or tuple of paths, a
        list of such Paths instead. If `exception_on_fail` is False, any Path
        may be an Exception if that file does not exist, the download failed,
        or a timeout occurred.

    Raises:
        FileNotFoundError: If a file does not exist or could not be
            downloaded, and `exception_on_fail` is True.
        TimeoutError: If the lock allowing download of a file could not be
            acquired within the given timeout or, for a multi-file download,
            if waiting for other processes to download locked files timed out,
            and `exception_on_fail` is True.

    Notes:
        File download is normally an atomic operation; a program will never
        see a partially-downloaded file, and if a download is interrupted
        there will be no file present. However, when downloading multiple
        files at the same time, as many files as possible are downloaded
        before an exception is raised.
    """
    downloads_before = self.filecache.download_counter

    thread_count = self._validate_nthreads(nthreads)
    effective_timeout = self._lock_timeout if lock_timeout is None else lock_timeout
    targets = self._make_paths_absolute(sub_path)
    effective_url_to_url = url_to_url or self._url_to_url
    effective_url_to_path = url_to_path or self._url_to_path

    try:
        return self.filecache.retrieve(cast(StrOrPathOrSeqType, targets),
                                       anonymous=self._anonymous,
                                       lock_timeout=effective_timeout,
                                       nthreads=thread_count,
                                       exception_on_fail=exception_on_fail,
                                       url_to_url=effective_url_to_url,
                                       url_to_path=effective_url_to_path)
    finally:
        # Credit this path with whatever the cache downloaded during the call,
        # even when the retrieval raised part-way through.
        delta = self.filecache.download_counter - downloads_before
        self._download_counter += delta
def upload(self,
           sub_path: Optional[StrOrPathOrSeqType] = None,
           *,
           nthreads: Optional[int] = None,
           exception_on_fail: bool = True,
           url_to_url: Optional[UrlToUrlFuncOrSeqType] = None,
           url_to_path: Optional[UrlToPathFuncOrSeqType] = None
           ) -> Path | Exception | list[Path | Exception]:
    """Upload one or more files from the file cache to their storage location.

    Parameters:
        sub_path: The path of the file relative to this path; when omitted,
            this path itself is used. If `sub_path` is a list or tuple, the
            complete list of files is uploaded, which may be more efficient
            because files can be uploaded in parallel. A derived path that is
            not absolute is treated as a relative local path and made absolute
            by expanding usernames and resolving links.
        nthreads: The maximum number of threads to use when doing
            multiple-file retrieval or upload. If None, use the default value
            given when this :class:`FCPath` was created.
        exception_on_fail: If True, an exception is raised when any file does
            not exist or an upload fails. If False, the function returns
            normally and each failed upload is reported as the Exception that
            caused the failure, in place of the returned path.
        url_to_url: A translator function, or a sequence of them, that maps a
            URL to a different URL. Each translator is called as::

                func(scheme: str, remote: str, path: str) -> str

            where `scheme` is the URL scheme (such as ``"gs"`` or
            ``"file"``), `remote` is the bucket or webserver name (the empty
            string for a local file), and `path` is the remainder of the URL.
            A translator returns a complete replacement URL to override the
            default translation, or None to decline. Translators are tried in
            order until one returns a URL; otherwise the default translation
            applies. If None, the `url_to_url` value stored on this
            :class:`FCPath` is passed along instead.
        url_to_path: A translator function, or a sequence of them, that maps a
            URL to a local path. Without translators, :class:`FileCache` lays
            files out as ``<cache_dir>/<cache_name>/<source>/<path>``, where
            ``source`` is the URL prefix in a filesystem-friendly form (for
            example ``gs://bucket`` becomes ``gs_bucket``). Each translator is
            called as::

                func(scheme: str, remote: str, path: str, cache_dir: Path,
                     cache_subdir: str) -> str | Path

            where `scheme`, `remote`, and `path` are as for `url_to_url`,
            `cache_dir` is the top-level directory of the cache
            (``<cache_dir>/<cache_name>``), and `cache_subdir` is the
            subdirectory specific to this scheme and remote. A translator
            returns a Path to override the default, or None to decline. A
            relative Path is appended to `cache_dir`; an absolute Path is used
            as-is, so take care: it can reach files outside the cache
            directory. Translators are tried in order until one returns a
            Path; otherwise the default layout applies. Translators see the
            original URL, not the output of a `url_to_url` translator. If
            None, the `url_to_path` value stored on this :class:`FCPath` is
            passed along instead.

    Returns:
        The Path of the filename in the temporary directory (or the original
        absolute path if local). If `sub_path` was a list or tuple of paths, a
        list of such Paths instead. If `exception_on_fail` is False, any Path
        may be an Exception if that file does not exist or the upload failed.

    Raises:
        FileNotFoundError: If a file to upload does not exist or the upload
            failed, and `exception_on_fail` is True.
    """
    uploads_before = self.filecache.upload_counter

    thread_count = self._validate_nthreads(nthreads)
    targets = self._make_paths_absolute(sub_path)
    effective_url_to_url = url_to_url or self._url_to_url
    effective_url_to_path = url_to_path or self._url_to_path

    try:
        return self.filecache.upload(cast(StrOrPathOrSeqType, targets),
                                     anonymous=self._anonymous,
                                     nthreads=thread_count,
                                     exception_on_fail=exception_on_fail,
                                     url_to_url=effective_url_to_url,
                                     url_to_path=effective_url_to_path)
    finally:
        # Credit this path with whatever the cache uploaded during the call,
        # even when the upload raised part-way through.
        delta = self.filecache.upload_counter - uploads_before
        self._upload_counter += delta
@contextlib.contextmanager
def open(self,
         mode: str = 'r',
         *args: Any,
         url_to_url: Optional[UrlToUrlFuncOrSeqType] = None,
         url_to_path: Optional[UrlToPathFuncOrSeqType] = None,
         **kwargs: Any) -> Iterator[IO[Any]]:
    """Retrieve+open or open+upload a file as a context manager.

    When `mode` starts with ``'r'`` (such as ``'r'`` or ``'rb'``) the file is
    first fetched with :meth:`retrieve` and then opened. For any other mode
    (such as ``'w'`` or ``'wb'``) the local file is opened for writing, and
    the file is uploaded with :meth:`upload` once the context manager exits
    without an exception.

    Parameters:
        mode: The mode string as you would give it to Python's ``open()``
            function.
        *args: Any additional positional arguments are passed to the Python
            ``open()`` function.
        url_to_url: A translator function, or a sequence of them, that maps a
            URL to a different URL. Each translator is called as::

                func(scheme: str, remote: str, path: str) -> str

            where `scheme` is the URL scheme (such as ``"gs"`` or
            ``"file"``), `remote` is the bucket or webserver name (the empty
            string for a local file), and `path` is the remainder of the URL.
            A translator returns a complete replacement URL to override the
            default translation, or None to decline. Translators are tried in
            order until one returns a URL; otherwise the default translation
            applies. If None, the `url_to_url` value stored on this
            :class:`FCPath` is used instead.
        url_to_path: A translator function, or a sequence of them, that maps a
            URL to a local path. Without translators, :class:`FileCache` lays
            files out as ``<cache_dir>/<cache_name>/<source>/<path>``, where
            ``source`` is the URL prefix in a filesystem-friendly form (for
            example ``gs://bucket`` becomes ``gs_bucket``). Each translator is
            called as::

                func(scheme: str, remote: str, path: str, cache_dir: Path,
                     cache_subdir: str) -> str | Path

            where `scheme`, `remote`, and `path` are as for `url_to_url`,
            `cache_dir` is the top-level directory of the cache
            (``<cache_dir>/<cache_name>``), and `cache_subdir` is the
            subdirectory specific to this scheme and remote. A translator
            returns a Path to override the default, or None to decline. A
            relative Path is appended to `cache_dir`; an absolute Path is used
            as-is, so take care: it can reach files outside the cache
            directory. Translators are tried in order until one returns a
            Path; otherwise the default layout applies. Translators see the
            original URL, not the output of a `url_to_url` translator. If
            None, the `url_to_path` value stored on this :class:`FCPath` is
            used instead.
        **kwargs: Any additional keyword arguments are passed to the Python
            ``open()`` function.

    Returns:
        IO object: The same object as would be returned by the normal
        ``open()`` function.
    """
    translators = {'url_to_url': url_to_url or self._url_to_url,
                   'url_to_path': url_to_path or self._url_to_path}

    reading = mode[0] == 'r'
    if reading:
        local_path = cast(Path, self.retrieve(None, **translators))
    else:  # 'w', 'x', 'a'
        local_path = cast(Path, self.get_local_path(None, **translators))

    with open(local_path, mode, *args, **kwargs) as fp:
        yield fp

    # Reached only when the caller's block finished without raising, so a
    # failed write is never uploaded.
    if not reading:
        self.upload(None, **translators)
@property
def download_counter(self) -> int:
    """The number of actual file downloads that have taken place."""

    # Read-only view of the private counter attribute
    return self._download_counter

@property
def upload_counter(self) -> int:
    """The number of actual file uploads that have taken place."""

    # Read-only view of the private counter attribute
    return self._upload_counter
def is_local(self) -> bool:
    """True if the path refers to the local filesystem.

    A path is local when it carries no URL scheme at all, or when it is a
    ``file:///`` URL.
    """

    url = self._path
    if '://' not in url:
        # No scheme separator: a plain filesystem path
        return True
    return url.startswith('file:///')
def is_file(self) -> bool:
    """True if this path exists and is a regular file.

    "Regular file" here means the path exists and :meth:`is_dir` reports False.
    """

    found = self.exists()
    if not found:
        # Hand back the falsy result of exists() unchanged
        return cast(bool, found)
    return not self.is_dir()
def read_bytes(self, **kwargs: Any) -> bytes:
    """Download and open the file in bytes mode, read it, and close the file.

    Any additional arguments are passed to the Python ``open()`` function.

    Returns:
        The complete contents of the file. This is the `bytes` object produced by
        reading a file opened in ``'rb'`` mode.
    """

    with self.open(mode='rb', **kwargs) as f:
        # A binary-mode read() yields bytes, not bytearray
        return cast(bytes, f.read())
def read_text(self, **kwargs: Any) -> str:
    """Download the file, read its full contents as text, and close it.

    Any additional arguments are passed to the Python ``open()`` function.

    Returns:
        The complete contents of the file as a string.
    """

    with self.open(mode='r', **kwargs) as stream:
        contents = stream.read()
    return cast(str, contents)
def write_bytes(self, data: Any, **kwargs: Any) -> int:
    """Write binary data to the file, then close and upload it.

    Any additional arguments are passed to the Python ``open()`` function.

    Parameters:
        data: Any object supporting the buffer interface.

    Returns:
        The value returned by the file object's ``write()`` call.

    Raises:
        TypeError: If `data` does not support the buffer interface. This is raised
            before the file is opened, so an existing file is not truncated.
    """

    # Validate the payload up front; memoryview() rejects non-buffer objects
    buffer = memoryview(data)
    with self.open(mode='wb', **kwargs) as stream:
        written = stream.write(buffer)
    return written
def write_text(self, data: Any, **kwargs: Any) -> int:
    """Write a string to the file in text mode, then close and upload it.

    Any additional arguments are passed to the Python ``open()`` function.

    Parameters:
        data: The text to write; must be a `str`.

    Returns:
        The value returned by the file object's ``write()`` call.

    Raises:
        TypeError: If `data` is not a `str`. This is raised before the file is
            opened.
    """

    if isinstance(data, str):
        with self.open(mode='w', **kwargs) as stream:
            return stream.write(data)
    raise TypeError('data must be str, not %s' %
                    data.__class__.__name__)
def iterdir(self) -> Iterator[FCPath]:
    """Yield FCPath objects of the current path's directory contents.

    The children are yielded in arbitrary order, and the special entries '.' and
    '..' are not included.

    The listing is delegated to the associated :class:`FileCache`, using this
    path's `url_to_url` translators.
    """

    for obj in self.filecache.iterdir(self._path, url_to_url=self._url_to_url):
        # Each child inherits this FCPath's settings via copy_from
        yield FCPath(obj, copy_from=self)
def iterdir_metadata(self) -> Iterator[tuple[FCPath, dict[str, Any] | None]]:
    """Yield FCPath objects of the current directory's contents, with metadata.

    Yields:
        All files and sub-directories in the given directory (except ``.`` and
        ``..``), in no particular order. Each file or directory is represented by a
        tuple of the form (path, metadata), where path is an :class:`FCPath` for
        the file or directory and metadata is a dictionary with the following keys:

        - ``is_dir``: True if the returned name is a directory, False if it is a
          file.
        - ``mtime``: The last modification time of the file as a float.
        - ``size``: The approximate size of the file in bytes.

        If the metadata can not be retrieved, None is returned for the metadata.
    """

    for obj, metadata in (
            self.filecache.iterdir_metadata(self._path, url_to_url=self._url_to_url)):
        # Each child inherits this FCPath's settings via copy_from; the metadata
        # is passed through exactly as the FileCache reported it
        yield FCPath(obj, copy_from=self), metadata
def glob(self, pattern: str | Path | FCPath) -> Generator[FCPath]:
    """Yield all existing files and directories matching the given relative pattern.

    Parameters:
        pattern: The glob pattern, relative to this path.

    Yields:
        An :class:`FCPath` for each match, inheriting this path's settings.

    Raises:
        NotImplementedError: If `pattern` is an absolute path. Because this is a
            generator, the exception is raised when iteration starts.

    Notes:
        If the FCPath is local, then the normal `pathlib.Path.glob()` method is
        called. If the pattern is only `**`, this function had different behavior
        before Python 3.13 (only directories returned) and in Python 3.13 and later
        (both files and directories are returned). In contrast, when the FCPath is
        remote, we always return all files and directories. To be safe, do not use
        `**` but instead always use `**/*`.
    """

    if not isinstance(pattern, FCPath):
        pattern = FCPath(pattern)
    if pattern.is_absolute():
        raise NotImplementedError('Non-relative patterns are unsupported')
    if self.is_local():
        # Local paths: let pathlib do the work
        for res in self.as_pathlib().glob(pattern.path):
            yield FCPath(res, copy_from=self)
        return

    # Remote paths: walk the listing ourselves. The selector consumes pattern
    # parts by popping from the end of the list, so hand it a reversed copy.
    parts = pattern.path.split('/')
    select = _StringGlobber(self, recursive=True).selector(parts[::-1])
    for path in select(self.path):
        yield FCPath(path, copy_from=self)
def rglob(self, pattern: str | Path | FCPath) -> Generator[FCPath]:
    """Yield all existing files and directories matching the given relative pattern.

    This is like calling :meth:`FCPath.glob()` with ``**/`` added in front of the
    pattern.

    Parameters:
        pattern: The glob pattern, relative to this path.

    Returns:
        The generator produced by :meth:`glob` for the extended pattern.

    Notes:
        If the FCPath is local, then the normal `pathlib.Path.glob()` method is
        called. If the pattern is only `**`, this function had different behavior
        before Python 3.13 (only directories returned) and in Python 3.13 and later
        (both files and directories are returned). In contrast, when the FCPath is
        remote, we always return all files and directories. To be safe, do not use
        `**` but instead always use `**/*`.
    """

    if not isinstance(pattern, FCPath):
        pattern = FCPath(pattern)
    # Prefix the pattern with the recursive wildcard and defer to glob()
    pattern = '**' / pattern
    return self.glob(pattern)
def walk(self,
         top_down: bool = True
         ) -> Iterator[tuple[FCPath, list[str], list[str]]]:
    """Walk the directory tree rooted at this directory.

    See `pathlib.Path.walk` for full documentation.

    Parameters:
        top_down: If True, a directory is yielded before its sub-directories and
            the yielded directory-name list may be edited by the caller to control
            which sub-directories are visited. If False, a directory is yielded
            after all of its sub-directories.

    Yields:
        Tuples of (directory, sub-directory names, file names). Entries whose
        metadata could not be retrieved are left out of both name lists.
    """

    # Work stack holding either a directory still to be listed, or (bottom-up only)
    # a finished result tuple waiting for its subtree to be emitted first
    pending: list[FCPath | tuple[FCPath, list[str], list[str]]] = [self]

    while pending:
        item = pending.pop()
        if isinstance(item, tuple):
            yield item
            continue

        subdirs: list[str] = []
        files: list[str] = []
        if not top_down:
            # Pushed before the children so it is popped after all of them
            pending.append((item, subdirs, files))

        for entry, info in item.iterdir_metadata():
            if info is None:
                continue
            if not info['is_dir']:
                files.append(entry.name)
                continue
            if not top_down:
                pending.append(entry)
            subdirs.append(entry.name)

        if top_down:
            yield item, subdirs, files
            # Read subdirs only after the yield so caller-side pruning is honored;
            # reversed so that the first name is popped (visited) first
            pending.extend(item.joinpath(name) for name in reversed(subdirs))
def rename(self, target: str | Path | FCPath) -> FCPath:
    """Rename this path to the target path.

    Both the source and target paths must be absolute, and must be in the same
    location (e.g. both local files or both in the same GS bucket).

    Because cloud platforms do not support renaming of files, this is accomplished
    by downloading the source file, uploading it with the new name, and deleting the
    original version. If the target already exists, it will be overwritten.

    If the downloading or uploading fails, the copy in the local cache is removed to
    eliminate ambiguity. If there is only a copy in the local cache and the source
    path does not exist on the remote, the rename will still succeed by uploading a
    copy to the target path.

    Parameters:
        target: The path to rename to.

    Returns:
        The new FCPath instance pointing to the target path.

    Raises:
        ValueError: If one path is local and the other remote, or if the two paths
            do not share the same drive and root.
    """

    if not isinstance(target, FCPath):
        target = FCPath(target)

    target = target.as_absolute()
    src = self.as_absolute()

    if src.is_local() != target.is_local():
        raise ValueError('Unable to rename files between local and remote locations: '
                         f'{src.path!r} and {target.path!r}')

    # Only the drive and root identify the location; the sub-path is free to differ
    drive1, root1, _ = FCPath._split_parts(src.path)
    drive2, root2, _ = FCPath._split_parts(target.path)
    if drive1 != drive2 or root1 != root2:
        raise ValueError('Unable to rename files across locations: '
                         f'{src.path!r} and {target.path!r}')

    if src.is_local():
        # Local to local - just do an OS-level move and be done with it.
        # Path.replace() is used instead of Path.rename() so that an existing
        # target is overwritten on every platform, as documented above.
        target.parent.mkdir(parents=True, exist_ok=True)
        src.as_pathlib().replace(target.as_pathlib())
        return target

    # Since you generally can't rename on a remote cloud location, first
    # download the file, then rename it locally, then upload it to the new name,
    # then delete the old name
    local_path = cast(Path, src.retrieve())
    target_local_path = cast(Path, target.get_local_path())
    # Move within the cache, overwriting any stale cached copy of the target
    local_path.replace(target_local_path)
    try:
        target.upload()  # Upload the new version
    except Exception:
        # Don't leave a cached file that was never successfully uploaded
        target_local_path.unlink(missing_ok=True)
        raise
    src.unlink(missing_ok=True)  # Delete the old version

    return target
def replace(self, target: str | Path | FCPath) -> FCPath:
    """Rename this path to the target path, overwriting if that path exists.

    This is a synonym for :meth:`rename` and accepts the same kinds of target.

    Both the source and target paths must be absolute, and must be in the same
    location (e.g. both local files or both in the same GS bucket).

    Because cloud platforms do not support renaming of files, this is accomplished
    by downloading the source file, uploading it with the new name, and deleting the
    original version. If the target already exists, it will be overwritten.

    Parameters:
        target: The path to rename to.

    Returns:
        The new FCPath instance pointing to the target path.
    """

    return self.rename(target)
def _remote_relative_to(self, other: FCPath) -> FCPath:
    """Return this non-local path relative to `other`, by URL comparison.

    The comparison is made on whole path components, so ``gs://b/abc`` is not
    considered to be inside ``gs://b/ab``.

    Raises:
        ValueError: If this path is neither equal to `other` nor located below it.
    """

    base = other._path
    if self._path == base:
        return FCPath('', copy_from=self)
    # Force the prefix to end at a component boundary
    prefix = base if base.endswith('/') else f'{base}/'
    if not self._path.startswith(prefix):
        raise ValueError(f"{str(self)!r} is not in the subpath of {str(other)!r}")
    return FCPath(self._path[len(prefix):], copy_from=self)

if sys.version_info >= (3, 12):
    def relative_to(self,
                    other: str | Path | FCPath,
                    *,
                    walk_up: bool = False) -> FCPath:
        """Return the relative path to another path.

        See `pathlib.Path.relative_to` for full documentation.

        Parameters:
            other: The path this one should be expressed relative to.
            walk_up: Passed to `pathlib.Path.relative_to` for local paths. Not
                supported for non-local paths.

        Returns:
            The relative path, inheriting this path's settings.

        Raises:
            NotImplementedError: If `walk_up` is True and this path is not local.
            ValueError: If this path is not located under `other`.
        """

        if not isinstance(other, FCPath):
            other = FCPath(other)
        if self.is_local():
            return FCPath(self.as_pathlib().relative_to(other.as_pathlib(),
                                                        walk_up=walk_up),
                          copy_from=self)
        if walk_up:
            raise NotImplementedError(
                'walk_up is not supported for non-local FCPaths')
        return self._remote_relative_to(other)
else:
    def relative_to(self, other: str | Path | FCPath) -> FCPath:
        """Return the relative path to another path.

        See `pathlib.Path.relative_to` for full documentation.

        Parameters:
            other: The path this one should be expressed relative to.

        Returns:
            The relative path, inheriting this path's settings.

        Raises:
            ValueError: If this path is not located under `other`.
        """

        if not isinstance(other, FCPath):
            other = FCPath(other)
        if self.is_local():
            return FCPath(self.as_pathlib().relative_to(other.as_pathlib()),
                          copy_from=self)
        return self._remote_relative_to(other)
[docs] def is_relative_to(self, other: str | Path | FCPath) -> bool: """Return True if the path is relative to another path. See `pathlib.Path.is_relative_to` for full documentation. """ if not isinstance(other, FCPath): other = FCPath(other) return self._path.startswith(other._path)
def is_reserved(self) -> bool:
    """True if the path contains a special reserved name.

    Only valid for local paths. See `pathlib.Path.is_reserved` for full
    documentation.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError('is_reserved on a remote file is not implemented')
    # Local path: defer entirely to pathlib
    return self.as_pathlib().is_reserved()
def stat(self, *, follow_symlinks: bool = True) -> Any:
    """Return the result of the stat() system call on this path.

    Only valid for local files. See `pathlib.Path.stat` for full documentation.

    Parameters:
        follow_symlinks: Passed through to `pathlib.Path.stat`.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError('stat on a remote file is not implemented; use '
                                  'modification_time() if you just want that info')
    # Local path: defer entirely to pathlib
    return self.as_pathlib().stat(follow_symlinks=follow_symlinks)
def lstat(self) -> Any:
    """Like stat(), except if the path points to a symlink, the symlink's
    status information is returned, rather than its target's.

    Only valid for local files. See `pathlib.Path.lstat` for full documentation.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError('lstat on a remote file is not implemented')
    # Local path: defer entirely to pathlib
    return self.as_pathlib().lstat()
def is_mount(self) -> bool:
    """Check if this path is a mount point.

    Only valid for local directories. See `pathlib.Path.is_mount` for full
    documentation.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError('is_mount on a remote directory is not implemented')
    # Local path: defer entirely to pathlib
    return self.as_pathlib().is_mount()
# Only defined on Python 3.12 and later, where pathlib.Path.is_junction is called
if sys.version_info >= (3, 12):
    def is_junction(self) -> bool:
        """Whether this path is a junction.

        Only valid for local files. See `pathlib.Path.is_junction` for full
        documentation.

        Raises:
            NotImplementedError: If the path is not local.
        """

        if not self.is_local():
            raise NotImplementedError(
                'is_junction on a remote file is not implemented')
        # Local path: defer entirely to pathlib
        return self.as_pathlib().is_junction()
def is_block_device(self) -> bool:
    """Whether this path is a block device.

    Only valid for local files. See `pathlib.Path.is_block_device` for full
    documentation.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError(
            'is_block_device on a remote file is not implemented')
    # Local path: defer entirely to pathlib
    return self.as_pathlib().is_block_device()
def is_char_device(self) -> bool:
    """Whether this path is a character device.

    Only valid for local files. See `pathlib.Path.is_char_device` for full
    documentation.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError(
            'is_char_device on a remote file is not implemented')
    # Local path: defer entirely to pathlib
    return self.as_pathlib().is_char_device()
def is_fifo(self) -> bool:
    """Whether this path is a FIFO.

    Only valid for local files. See `pathlib.Path.is_fifo` for full documentation.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError('is_fifo on a remote file is not implemented')
    # Local path: defer entirely to pathlib
    return self.as_pathlib().is_fifo()
def is_socket(self) -> bool:
    """Whether this path is a socket.

    Only valid for local files. See `pathlib.Path.is_socket` for full
    documentation.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError('is_socket on a remote file is not implemented')
    # Local path: defer entirely to pathlib
    return self.as_pathlib().is_socket()
def samefile(self, other_path: str | Path | FCPath) -> bool:
    """True if this path and the given path refer to the same file.

    Unlike the `pathlib.Path.samefile` version, this function only looks to see if
    the URLs are identical. Thus symlinks, hardlinks, etc. are ignored.

    Parameters:
        other_path: The path to compare against.
    """

    if not isinstance(other_path, FCPath):
        other_path = FCPath(other_path)
    # Pure string comparison; nothing on disk or on the remote is consulted
    return self._path == other_path._path
def absolute(self) -> FCPath:
    """Return an absolute version of this path.

    For non-local paths, this just returns the URL. For local paths, it does the
    same operations as `pathlib.Path.absolute`. See `pathlib.Path.absolute` for full
    documentation.

    Returns:
        This same object for a non-local path, otherwise a new :class:`FCPath`
        inheriting this path's settings.
    """

    if not self.is_local():
        return self
    return FCPath(self.as_pathlib().absolute(), copy_from=self)
@classmethod
def cwd(cls) -> FCPath:
    """Return a new FCPath pointing to the current working directory.

    See `pathlib.Path.cwd` for full documentation.

    The result is built with the FCPath constructor's default settings; no
    settings are copied from any existing instance.
    """

    return FCPath(Path.cwd())
def expanduser(self) -> FCPath:
    """Return a new FCPath with expanded ~ and ~user constructs.

    See `pathlib.Path.expanduser` for full documentation.

    Returns:
        A new :class:`FCPath` inheriting this path's settings for a local path;
        this same object, unchanged, for a non-local path.
    """

    if self.is_local():
        return FCPath(self.as_pathlib().expanduser(), copy_from=self)
    return self
def expandvars(self) -> FCPath:
    """Return a new FCPath with expanded environment variables.

    See `os.path.expandvars` for full documentation.

    The expansion is applied to the result of :meth:`as_posix` for local and
    non-local paths alike; the new :class:`FCPath` inherits this path's settings.
    """

    return FCPath(os.path.expandvars(self.as_posix()), copy_from=self)
@classmethod
def home(cls) -> FCPath:
    """Return a new FCPath pointing to expanduser('~').

    See `pathlib.Path.home` for full documentation.

    The result is built with the FCPath constructor's default settings; no
    settings are copied from any existing instance.
    """

    return FCPath(os.path.expanduser('~'))
def resolve(self, strict: bool = False) -> FCPath:
    """Return the absolute path with resolved symlinks.

    See `pathlib.Path.resolve` for full documentation.

    Parameters:
        strict: Passed through to `pathlib.Path.resolve`.

    Returns:
        A new :class:`FCPath` inheriting this path's settings for a local path;
        this same object, unchanged, for a non-local path.
    """

    if self.is_local():
        return FCPath(self.as_pathlib().absolute().resolve(strict=strict),
                      copy_from=self)
    return self
def touch(self, mode: int = 0o666, exist_ok: bool = True) -> None:
    """Create this file, if it doesn't exist.

    See `pathlib.Path.touch` for full documentation.

    For a local path this defers to `pathlib.Path.touch`. For a non-local path, a
    missing file is created empty, and an existing file is retrieved and uploaded
    again so that the remote copy is rewritten.

    Parameters:
        mode: The permission bits for a newly created local file. Ignored for
            non-local paths.
        exist_ok: If False, raise an exception when the file already exists.

    Raises:
        FileExistsError: If the file already exists and `exist_ok` is False.
    """

    if self.is_local():
        # pathlib handles creation, timestamp update, and exist_ok by itself
        self.as_pathlib().touch(mode=mode, exist_ok=exist_ok)
        return

    if not self.exists():
        self.write_bytes(b'')
        return

    if not exist_ok:
        raise FileExistsError(f'File exists: {str(self)!r}')

    # Read and write the file so that the remote copy gets a fresh timestamp
    self.retrieve()
    self.upload()
def mkdir(self,
          mode: int = 0o777,
          parents: bool = False,
          exist_ok: bool = False) -> None:
    """Create a new directory at this given path.

    Only valid for local directories. See `pathlib.Path.mkdir` for full
    documentation.

    Parameters:
        mode: Passed through to `pathlib.Path.mkdir`.
        parents: Passed through to `pathlib.Path.mkdir`.
        exist_ok: Passed through to `pathlib.Path.mkdir`.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError('mkdir on a remote directory is not implemented')
    # Local path: defer entirely to pathlib
    self.as_pathlib().mkdir(mode=mode, parents=parents, exist_ok=exist_ok)
def chmod(self, mode: int, *, follow_symlinks: bool = True) -> None:
    """Change the permissions of the path, like os.chmod().

    Only valid for local files. See `pathlib.Path.chmod` for full documentation.

    Parameters:
        mode: Passed through to `pathlib.Path.chmod`.
        follow_symlinks: Passed through to `pathlib.Path.chmod`.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError('chmod on a remote file is not implemented')
    # Local path: defer entirely to pathlib
    self.as_pathlib().chmod(mode=mode, follow_symlinks=follow_symlinks)
def lchmod(self, mode: int) -> None:
    """Like chmod(), except if the path points to a symlink, the symlink's
    permissions are changed, rather than its target's.

    Only valid for local files. See `pathlib.Path.lchmod` for full documentation.

    Parameters:
        mode: Passed through to `pathlib.Path.lchmod`.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError('lchmod on a remote file is not implemented')
    # Local path: defer entirely to pathlib
    self.as_pathlib().lchmod(mode=mode)
def rmdir(self) -> None:
    """Remove this directory. The directory must be empty.

    Only valid for local directories. See `pathlib.Path.rmdir` for full
    documentation.

    Raises:
        NotImplementedError: If the path is not local.
    """

    if not self.is_local():
        raise NotImplementedError('rmdir on a remote directory is not implemented')
    # Local path: defer entirely to pathlib
    self.as_pathlib().rmdir()
# The follow_symlinks keyword is only offered (and only passed on to pathlib) on
# Python 3.13 and later; older versions get the argument-less variants.
if sys.version_info >= (3, 13):
    def owner(self, *, follow_symlinks: bool = True) -> str:
        """Return the login name of the file owner.

        Only valid for local files. See `pathlib.Path.owner` for full
        documentation.

        Parameters:
            follow_symlinks: Passed through to `pathlib.Path.owner`.

        Raises:
            NotImplementedError: If the path is not local.
        """

        if not self.is_local():
            raise NotImplementedError('owner on a remote file is not implemented')
        return self.as_pathlib().owner(follow_symlinks=follow_symlinks)

    def group(self, *, follow_symlinks: bool = True) -> str:
        """Return the group name of the file gid.

        Only valid for local files. See `pathlib.Path.group` for full
        documentation.

        Parameters:
            follow_symlinks: Passed through to `pathlib.Path.group`.

        Raises:
            NotImplementedError: If the path is not local.
        """

        if not self.is_local():
            raise NotImplementedError('group on a remote file is not implemented')
        return self.as_pathlib().group(follow_symlinks=follow_symlinks)
else:
    def owner(self) -> str:
        """Return the login name of the file owner.

        Only valid for local files. See `pathlib.Path.owner` for full
        documentation.

        Raises:
            NotImplementedError: If the path is not local.
        """

        if not self.is_local():
            raise NotImplementedError('owner on a remote file is not implemented')
        return self.as_pathlib().owner()

    def group(self) -> str:
        """Return the group name of the file gid.

        Only valid for local files. See `pathlib.Path.group` for full
        documentation.

        Raises:
            NotImplementedError: If the path is not local.
        """

        if not self.is_local():
            raise NotImplementedError('group on a remote file is not implemented')
        return self.as_pathlib().group()
@classmethod
def from_uri(cls, uri: str) -> FCPath:
    """Return a new FCPath from the given URI.

    The URI is handed to the FCPath constructor unchanged, with default settings.
    """

    return FCPath(uri)
def as_uri(self) -> str:
    """Return the path as a URI.

    Non-local paths and paths already starting with ``file://`` are returned
    as-is. Other local paths are prefixed to form a ``file:`` URI; no
    percent-encoding is applied by this method.

    Raises:
        ValueError: If the path is not absolute.
    """

    if not self.is_absolute():
        raise ValueError("relative path can't be expressed as a file URI")
    if not self.is_local() or self.path.startswith('file://'):
        return self._path
    drive, root, subpath = FCPath._split_parts(self._path)
    if len(drive) == 2:
        # It's a path on a local drive => 'file:///c:/a/b'
        return f'file:///{self._path}'
    elif drive:
        # It's a path on a network drive => 'file://host/share/a/b'
        return f'file:{self._path}'
    # It's a posix path => 'file:///etc/hosts'
    return f'file://{self._path}'
def _translate2(pat: str, STAR: str, QUESTION_MARK: str) -> list[str]:
    """Translate one shell-style pattern segment into regular-expression fragments.

    Parameters:
        pat: The pattern segment, which may contain ``*``, ``?`` and ``[...]``.
        STAR: The regex fragment to emit for a run of one or more ``*``.
        QUESTION_MARK: The regex fragment to emit for each ``?``.

    Returns:
        The list of regex fragments, in order; the caller joins them.
    """

    res: list[str] = []
    add = res.append
    i, n = 0, len(pat)
    while i < n:
        c = pat[i]
        i = i+1
        if c == '*':
            # compress consecutive `*` into one
            # (identity test, so only a fragment added for `*` counts)
            if (not res) or res[-1] is not STAR:
                add(STAR)
        elif c == '?':
            add(QUESTION_MARK)
        elif c == '[':
            # Find the closing bracket; a leading '!' and a ']' in first position
            # belong to the set and do not terminate it
            j = i
            if j < n and pat[j] == '!':
                j = j+1
            if j < n and pat[j] == ']':
                j = j+1
            while j < n and pat[j] != ']':
                j = j+1
            if j >= n:
                # No closing bracket: treat the '[' as a literal character
                add('\\[')
            else:
                stuff = pat[i:j]
                if '-' not in stuff:
                    stuff = stuff.replace('\\', r'\\')
                else:
                    # Split the set at the hyphens that form ranges
                    chunks = []
                    k = i+2 if pat[i] == '!' else i+1
                    while True:
                        k = pat.find('-', k, j)
                        if k < 0:
                            break
                        chunks.append(pat[i:k])
                        i = k+1
                        k = k+3
                    chunk = pat[i:j]
                    if chunk:
                        chunks.append(chunk)
                    else:
                        chunks[-1] += '-'
                    # Remove empty ranges -- invalid in RE.
                    for k in range(len(chunks)-1, 0, -1):
                        if chunks[k-1][-1] > chunks[k][0]:
                            chunks[k-1] = chunks[k-1][:-1] + chunks[k][1:]
                            del chunks[k]
                    # Escape backslashes and hyphens for set difference (--).
                    # Hyphens that create ranges shouldn't be escaped.
                    stuff = '-'.join(s.replace('\\', r'\\').replace('-', r'\-')
                                     for s in chunks)
                # Escape set operations (&&, ~~ and ||).
                stuff = re.sub(r'([&~|])', r'\\\1', stuff)
                i = j+1
                if not stuff:
                    # Empty range: never match.
                    add('(?!)')
                elif stuff == '!':
                    # Negated empty range: match any character.
                    add('.')
                else:
                    if stuff[0] == '!':
                        # Shell negation becomes regex negation
                        stuff = '^' + stuff[1:]
                    elif stuff[0] in ('^', '['):
                        stuff = '\\' + stuff
                    add(f'[{stuff}]')
        else:
            add(re.escape(c))
    assert i == n
    return res


# Patterns matching any one of the glob metacharacters `*`, `?` or `[`
magic_check = re.compile('([*?[])')
magic_check_bytes = re.compile(b'([*?[])')


def _translate(pat: str, *, recursive: bool = False) -> str:
    """Translate a pathname with shell wildcards to a regular expression.

    If `recursive` is true, the pattern segment '**' will match any number of path
    segments.

    The separator is always ``/``. A segment matched by a wildcard may not begin
    with a dot.

    Returns:
        The regular expression source, anchored at the end with ``\\Z``.
    """

    not_sep = '[^/]'
    # One path segment that does not start with '.', without / with trailing '/'
    one_last_segment = f'[^/.]{not_sep}*'
    one_segment = f'{one_last_segment}/'
    any_segments = f'(?:{one_segment})*'
    any_last_segments = f'{any_segments}(?:{one_last_segment})?'

    results = []
    parts = re.split('/', pat)
    last_part_idx = len(parts) - 1
    for idx, part in enumerate(parts):
        if part == '*':
            results.append(one_segment if idx < last_part_idx else one_last_segment)
        elif recursive and part == '**':
            if idx < last_part_idx:
                # Of several consecutive '**' parts only the last emits anything
                if parts[idx + 1] != '**':
                    results.append(any_segments)
            else:
                results.append(any_last_segments)
        else:
            if part:
                if part[0] in '*?':
                    # A leading wildcard must not match a leading dot
                    results.append(r'(?!\.)')
                results.extend(_translate2(part, f'{not_sep}*', not_sep))
            if idx < last_part_idx:
                results.append('/')
    res = ''.join(results)
    return fr'(?s:{res})\Z'


@functools.lru_cache(maxsize=512)
def _compile_pattern(pat: str, recursive: bool = True) -> Any:
    """Compile given glob pattern to a re.Pattern object (observing case
    sensitivity).

    Returns:
        The bound ``match`` method of the compiled pattern. Results are cached per
        (pattern, recursive) combination.
    """

    regex = _translate(pat, recursive=recursive)
    return re.compile(regex).match


class _StringGlobber:
    """Class providing shell-style pattern matching and globbing.

    Directory contents are obtained through :meth:`FCPath.iterdir_metadata` on
    FCPath objects built with the settings of `copy_from`.
    """

    def __init__(self, copy_from: FCPath, recursive: bool = False) -> None:
        """Store the settings source and whether ``**`` is treated specially.

        Parameters:
            copy_from: The FCPath whose settings are copied into every FCPath
                created while globbing.
            recursive: If True, a ``**`` pattern part selects recursively.
        """

        self.recursive = recursive
        self.copy_from = copy_from

    # High-level methods

    def compile(self, pat: str) -> Any:
        """Return the (cached) match function for the glob pattern `pat`."""

        return _compile_pattern(pat, self.recursive)

    def selector(self, parts: list[str]) -> Any:
        """Returns a function that selects from a given path, walking and
        filtering according to the glob-style pattern parts in *parts*.

        `parts` is consumed from its end (and modified in place), so it must be
        supplied in reverse order.
        """

        if not parts:
            return self.select_exists
        part = parts.pop()
        if self.recursive and part == '**':
            selector = self.recursive_selector
        else:
            selector = self.wildcard_selector
        return selector(part, parts)

    def wildcard_selector(self,
                          part: str,
                          parts: list[str]
                          ) -> Callable[[str, bool], Generator[FCPath]]:
        """Returns a function that selects direct children of a given path,
        filtering by pattern.
        """

        # A bare '*' accepts every name, so no regex is needed
        match = None if part == '*' else self.compile(part)
        # If more parts follow, only directories can lead to a match
        dir_only = bool(parts)
        if dir_only:
            # select_next is only bound (and only used) when dir_only is True
            select_next = self.selector(parts)

        def select_wildcard(path: str, exists: bool = False) -> Generator[FCPath]:
            entries = list(FCPath(path, copy_from=self.copy_from).iterdir_metadata())
            for entry, metadata in entries:
                if metadata is None:
                    # Entries without metadata are skipped
                    continue
                if match is None or match(entry.name):
                    if dir_only and not metadata['is_dir']:
                        continue
                    if dir_only:
                        yield from select_next(entry, exists=True)
                    else:
                        yield entry
        return select_wildcard

    def recursive_selector(self,
                           part: str,
                           parts: list[str]
                           ) -> Callable[[str, bool], Generator[FCPath]]:
        """Returns a function that selects a given path and all its children,
        recursively, filtering by pattern.
        """

        # Optimization: consume following '**' parts, which have no effect.
        while parts and parts[-1] == '**':
            parts.pop()

        # NOTE: the non-special parts that follow are not joined into `part` here;
        # they are left for the next selector. When `part` is exactly '**', no
        # regex filter is applied during the recursive walk (`match` is None).
        match = None if part == '**' else self.compile(part)
        dir_only = bool(parts)
        select_next = self.selector(parts)

        def select_recursive(path: str, exists: bool = False) -> Generator[FCPath]:
            # Make sure the starting path ends with a slash
            path_str = str(path)
            if path_str and path_str[-1] != '/':
                path_str = f'{path_str}/'
            # Matching starts just past the starting path
            match_pos = len(path_str)
            if match is None or match(path_str, match_pos):
                yield from select_next(path_str, exists)
            # Iterative walk using an explicit stack of directories still to list
            stack = [path_str]
            while stack:
                yield from select_recursive_step(stack, match_pos)

        def select_recursive_step(stack: list[str],
                                  match_pos: int) -> Generator[Any]:
            path = stack.pop()
            entries = list(FCPath(path, copy_from=self.copy_from).iterdir_metadata())
            for entry, metadata in entries:
                if metadata is None:
                    # Entries without metadata are skipped (and not descended into)
                    continue
                if metadata['is_dir'] or not dir_only:
                    if match is None or match(str(entry), match_pos):
                        if dir_only:
                            yield from select_next(entry, exists=True)
                        else:
                            # Optimization: directly yield the path if this is
                            # last pattern part.
                            yield entry
                    if metadata['is_dir']:
                        stack.append(entry.path)

        return select_recursive

    def select_exists(self, path: str, exists: bool = False) -> Generator[str]:
        """Yields the given path, if it exists.
        """

        if exists:
            # Optimization: this path is already known to exist, e.g. because
            # it was returned from a directory listing, so we skip calling
            # exists().
            yield path
        else:
            if FCPath(path, copy_from=self.copy_from).exists():
                yield path