Skip to content

Git

Git helper functions.

ChangeType (Enum)

An enumeration.

ConflictFile dataclass

Container for path and different versions of conflicted notebooks.

Source code in databooks/git_utils.py
class ConflictFile:
    """
    Container for path and different versions of conflicted notebooks.

    `get_conflict_blobs` builds one instance per unmerged file, from the index stage-2
     and stage-3 blobs of that file.
    """

    # path of the conflicted file (repo working directory joined with the blob filename)
    filename: Path
    # result of `blob2commit` for the stage-2 blob
    first_log: str
    # result of `blob2commit` for the stage-3 blob
    last_log: str
    # result of `blob2str` for the stage-2 blob
    first_contents: str
    # result of `blob2str` for the stage-3 blob
    last_contents: str

Contents dataclass

Container for the path and contents of one version of a file.

Source code in databooks/git_utils.py
class Contents:
    """Container for the path and contents of one version of a file."""

    # path of the file for this version (may be `None`)
    path: Optional[Path]
    # text of the file for this version (`None` when there are no contents)
    contents: Optional[str]

DiffContents dataclass

Container for both sides of a file diff and the type of change between them.

Source code in databooks/git_utils.py
class DiffContents:
    """Container for both sides of a file diff and the type of change between them."""

    # `a` side of the diff (`get_nb_diffs` fills it from the diff `a_path`/`a_blob`)
    a: Contents
    # `b` side of the diff (`get_nb_diffs` fills it from the diff `b_path`/`b_blob`)
    b: Contents
    # kind of change (`get_nb_diffs` looks it up by name from the diff `change_type`)
    change_type: ChangeType

UnmergedBlob dataclass

Container for git unmerged blobs.

Source code in databooks/git_utils.py
class UnmergedBlob:
    """Container for git unmerged blobs."""

    # path of the unmerged file (a key of `repo.index.unmerged_blobs()`)
    filename: Path
    # blobs of the file keyed by an integer stage (`get_conflict_blobs` reads 2 and 3)
    stage: Dict[int, Blob]

blob2commit(blob, repo)

Get the short commit message from blob hash.

Source code in databooks/git_utils.py
def blob2commit(blob: Blob, repo: Repo) -> str:
    """
    Get the short commit message from blob hash.

    The one-line log over all refs is searched first; when it has no entry for the
     blob, the one-line stash list is searched instead.
    :param blob: blob to look up
    :param repo: repo whose working directory the git commands run in
    :return: the log entry found for the blob, else the output of the stash lookup
    """
    git_cmd = Git(working_dir=repo.working_dir)
    log_entry = git_cmd.log(find_object=blob, max_count=1, all=True, oneline=True)
    if log_entry:
        return log_entry
    # Nothing in the log - fall back to looking for the blob in the stash
    return git_cmd.stash(
        "list", "--oneline", "--max-count", "1", "--find-object", blob
    )

blob2str(blob)

Get the blob contents if they exist (otherwise return None).

Source code in databooks/git_utils.py
def blob2str(blob: Optional[Blob]) -> Optional[str]:
    """
    Get the blob contents if they exist (otherwise return `None`).

    :param blob: git blob to read (or `None` when there is no blob)
    :return: blob contents as text (UTF-8 decoded), or `None` if there is no blob
    """
    if blob is None:
        return None
    raw = blob.data_stream.read()
    # The object stream yields `bytes`; decode so that the value really is the `str`
    #  promised by the return annotation. Values that are not `bytes` pass unchanged.
    return raw.decode("utf-8") if isinstance(raw, bytes) else raw

diff2contents(blob, ref, path, not_exists=False)

Get the blob contents from the diff.

Depends on whether we are diffing against current working tree and if object exists at diff time (added or deleted objects only exist at one side). If comparing against working tree (ref=None) we return the current file contents.

Parameters:

Name Type Description Default
blob Blob

git diff blob

required
ref Union[git.objects.tree.Tree, git.objects.commit.Commit, str]

git reference

required
path Path

path to object

required
not_exists bool

whether object exists at 'diff time' (added or removed objects do not exist)

False

Returns:

Type Description
Optional[str]

blob contents as a string (if exists)

Source code in databooks/git_utils.py
def diff2contents(
    blob: Blob,
    ref: Optional[Union[Tree, Commit, str]],
    path: Path,
    not_exists: bool = False,
) -> Optional[str]:
    """
    Get the blob contents from the diff.

    Depends on whether we are diffing against current working tree and if object exists
     at diff time (added or deleted objects only exist at one side). If comparing
     against working tree (`ref=None`) we return the current file contents.
    :param blob: git diff blob
    :param ref: git reference
    :param path: path to object
    :param not_exists: whether object exists at 'diff time' (added or removed objects
     do not exist)
    :return: blob contents as a string (if exists)
    """
    if not_exists:
        return None
    if ref is None:
        # Working-tree side: read the file itself. The encoding is explicit so the
        #  result does not depend on the locale default encoding.
        return path.read_text(encoding="utf-8")
    return blob2str(blob)

get_conflict_blobs(repo)

Get the source files for conflicts.

Source code in databooks/git_utils.py
def get_conflict_blobs(repo: Repo) -> List[ConflictFile]:
    """
    Get the source files for conflicts.

    :param repo: repo whose index holds the unmerged blobs
    :return: one `ConflictFile` per unmerged file, built from its stage-2 and stage-3
     blobs
    :raises RuntimeError: if `repo.working_dir` is neither a `pathlib.Path` nor a `str`
    """
    staged_by_file = repo.index.unmerged_blobs()

    if not isinstance(repo.working_dir, (Path, str)):
        raise RuntimeError(
            "Expected `repo` to be `pathlib.Path` or `str`, got"
            f" {type(repo.working_dir)}."
        )

    conflict_files = []
    for name, staged in staged_by_file.items():
        stages = dict(staged)
        if 0 in stages:
            # only keep blobs that could not be merged
            continue
        unmerged = UnmergedBlob(filename=Path(name), stage=stages)
        conflict_files.append(
            ConflictFile(
                filename=repo.working_dir / unmerged.filename,
                first_log=blob2commit(blob=unmerged.stage[2], repo=repo),
                last_log=blob2commit(blob=unmerged.stage[3], repo=repo),
                first_contents=blob2str(unmerged.stage[2]),
                last_contents=blob2str(unmerged.stage[3]),
            )
        )
    return conflict_files

get_nb_diffs(ref_base=None, ref_remote=None, paths=(), *, repo=None, verbose=False)

Get the notebook(s) git diff(s).

By default, diffs are compared with the current working directory. That is, staged files will still show up in the diffs. Only return the diffs for notebook files.

Source code in databooks/git_utils.py
def get_nb_diffs(
    ref_base: Optional[str] = None,
    ref_remote: Optional[str] = None,
    paths: Sequence[Path] = (),
    *,
    repo: Optional[Repo] = None,
    verbose: bool = False,
) -> List[DiffContents]:
    """
    Get the notebook(s) git diff(s).

    By default, diffs are compared with the current working directory. That is, staged
     files will still show up in the diffs. Only return the diffs for notebook files.
    :param ref_base: base git reference (the repo index is used if not passed)
    :param ref_remote: git reference to compare against (the working tree if not
     passed)
    :param paths: paths to restrict the diff to (all `*.ipynb` files under the repo
     root if empty)
    :param repo: repo to diff (looked up from the common parent of `paths` if not
     passed)
    :param verbose: whether to set the module logger to verbose
    :return: paths and contents of both sides, and the change type, for each diff
    :raises ValueError: if no repo (or no repo working directory) is found
    """
    if verbose:
        set_verbose(logger)

    common_path = find_common_parent(paths or [Path.cwd()])
    repo = get_repo(path=common_path) if repo is None else repo
    if repo is None or repo.working_dir is None:
        raise ValueError("No repo found - cannot compute diffs.")

    # Keep the resolved git objects in their own names instead of rebinding the
    #  `Optional[str]` parameters to index/tree objects.
    base = repo.index if ref_base is None else repo.tree(ref_base)
    remote = None if ref_remote is None else repo.tree(ref_remote)

    logger.debug(
        f"Looking for diffs on path(s) {[p.resolve() for p in paths]}.\n"
        f"Comparing `{base}` and `{remote}`."
    )
    repo_root_dir = Path(repo.working_dir)
    diff_paths = list(paths) or list(repo_root_dir.rglob("*.ipynb"))

    nb_diffs = []
    for d in base.diff(other=remote, paths=diff_paths):
        # `d.change_type` holds the *name* of the change type, so resolve the enum
        #  member first and compare members. Comparing the raw value to a member
        #  with `is` never matches, which left `not_exists` always `False` and made
        #  deleted files be read from the working tree.
        change_type = ChangeType[d.change_type]
        nb_diffs.append(
            DiffContents(
                a=Contents(
                    path=Path(d.a_path),
                    contents=diff2contents(
                        blob=cast(Blob, d.a_blob),
                        ref=base,
                        path=repo_root_dir / d.a_path,
                        # added objects do not exist on the `a` side
                        not_exists=change_type is ChangeType.A,
                    ),
                ),
                b=Contents(
                    path=Path(d.b_path),
                    contents=diff2contents(
                        blob=cast(Blob, d.b_blob),
                        ref=remote,
                        path=repo_root_dir / d.b_path,
                        # deleted objects do not exist on the `b` side
                        not_exists=change_type is ChangeType.D,
                    ),
                ),
                change_type=change_type,
            )
        )
    return nb_diffs

get_repo(path)

Find git repo in current or parent directories.

Source code in databooks/git_utils.py
def get_repo(path: Path) -> Optional[Repo]:
    """
    Find git repo in current or parent directories.

    :param path: path where the lookup finishes (the lookup starts at its anchor)
    :return: repo for the `.git` directory returned by `find_obj` (if any)
    """
    # NOTE(review): presumably `find_obj` walks from `start` to `finish` looking for
    #  a `.git` directory and returns `None` when there is none - confirm in `find_obj`
    repo_dir = find_obj(
        obj_name=".git", start=Path(path.anchor), finish=path, is_dir=True
    )
    if repo_dir is not None:
        repo = Repo(path=repo_dir)
        logger.debug(f"Repo found at: {repo.working_dir}.")
        return repo
    else:
        logger.debug(f"No repo found at {path}.")