Source code for credsweeper.file_handler.file_path_extractor

import os
from pathlib import Path
from typing import List

from git import InvalidGitRepositoryError, NoSuchPathError, Repo

from credsweeper.config import Config
from credsweeper.utils import Util


class FilePathExtractor:
    """Collect file paths to scan and filter them by configuration and gitignore rules."""

    # Class-level cache of opened git repositories, keyed by the directory
    # (pathlib.Path) the repository was opened from. Filled by is_valid_path.
    located_repos = {}

    @classmethod
    def apply_gitignore(cls, detected_files: List[str]) -> List[str]:
        """Apply gitignore rules for each file.

        Args:
            detected_files: list of files to be checked

        Return:
            List of files with all files ignored by git removed

        """
        # Dispatch through cls so a subclass overriding is_valid_path is honoured.
        return [file_path for file_path in detected_files if cls.is_valid_path(file_path)]

    @classmethod
    def get_file_paths(cls, config: "Config", path: str) -> List[str]:
        """Get all files in the directory.

        Automatically exclude files non-code or data files (such as .jpg).

        Args:
            config: credsweeper configuration
            path: path to the file or directory to be scanned

        Return:
            List all non-excluded files in the directory

        """
        # Replace ~ character with a full path to the home directory
        path = os.path.expanduser(path)

        # A single file: the result is either that file or nothing.
        if os.path.isfile(path):
            return [] if cls.check_exclude_file(config, path) else [path]

        file_paths = []
        for dirpath, _, filenames in os.walk(path):
            for filename in filenames:
                # The "/" separator is kept as-is so the returned path strings
                # (and what the exclusion checks see) stay unchanged.
                file_path = f"{dirpath}/{filename}"
                if cls.check_exclude_file(config, file_path):
                    continue
                # os.walk may list entries that are not regular files; skip those.
                if os.path.isfile(file_path):
                    file_paths.append(file_path)
        return file_paths

    @classmethod
    def is_valid_path(cls, path: str) -> bool:
        """Locate nearest .git directory to the path and check if path is ignored.

        Args:
            path: path to the file or directory to check

        Return:
            False if file is ignored by git. True otherwise

        """
        start_directory = Path(path).parent
        # Walk from the containing directory up to the root of the path,
        # stopping at the first directory that can be opened as a repository.
        for directory in (start_directory, *start_directory.parents):
            try:
                if directory in cls.located_repos:
                    repo = cls.located_repos[directory]
                else:
                    # Raises if the directory is not a git repository or does not exist.
                    repo = Repo(directory)
                    # Cache already located repositories, so we would not need
                    # to load it for each new file.
                    cls.located_repos[directory] = repo
                # repo.ignored returns the ignored subset of the given paths:
                # an empty result means the path is not ignored.
                return not repo.ignored(path)
            except (InvalidGitRepositoryError, NoSuchPathError):
                continue
        # Reached the top without locating a repository: nothing can ignore the path.
        return True

    @classmethod
    def check_exclude_file(cls, config: "Config", path: str) -> bool:
        """Check whether the path has to be excluded according to the configuration.

        The path is excluded when any of the following holds, checked in this order:
        it matches ``config.not_allowed_path_pattern``; it matches one of
        ``config.exclude_patterns``; one of ``config.exclude_paths`` is a substring
        of it; its extension (as returned by ``Util.get_extension``) is listed in
        ``config.exclude_extensions``.

        Args:
            config: credsweeper configuration
            path: path to the file to check

        Return:
            True if the file must be skipped. False otherwise

        """
        if config.not_allowed_path_pattern.match(path):
            return True
        if any(exclude_pattern.match(path) for exclude_pattern in config.exclude_patterns):
            return True
        if any(exclude_path in path for exclude_path in config.exclude_paths):
            return True
        if Util.get_extension(path) in config.exclude_extensions:
            return True
        return False