Source code for credsweeper.ml_model.features

"""Most rules are described in 'Secrets in Source Code: Reducing False Positives Using Machine Learning'."""

import os.path
from abc import ABC, abstractmethod
from typing import List, Any

import numpy as np
from scipy.sparse.csr import csr_matrix
from sklearn.preprocessing import LabelBinarizer

from credsweeper.common.constants import Chars
from credsweeper.credentials import Candidate


class Feature(ABC):
    """Base class for features.

    Subclasses implement :meth:`extract` for a single candidate; calling the
    instance applies ``extract`` to every candidate of a list.
    """

    def __call__(self, candidates: List[Candidate]) -> List[Any]:
        """Call base class for features.

        Args:
            candidates: list of candidates to extract features

        Returns:
            One ``extract`` result per candidate, in input order. The element
            type is whatever the concrete ``extract`` returns.
        """
        return [self.extract(candidate) for candidate in candidates]

    @abstractmethod
    def extract(self, candidate: Candidate) -> Any:
        """Extract the feature value from a single candidate.

        Args:
            candidate: candidate to extract the feature from

        Raises:
            NotImplementedError: always, in this base implementation
        """
        raise NotImplementedError
class WordInSecret(Feature):
    """Feature returns true if candidate value contains at least one word from predefined list."""

    def __init__(self, words: List[str]) -> None:
        """Feature is true if candidate value contains at least one predefined word.

        Args:
            words: list of predefined words
        """
        self.words = words

    def extract(self, candidate: Candidate) -> bool:
        """Check case-insensitively whether any predefined word occurs in the candidate value.

        Only the first entry of ``candidate.line_data_list`` is inspected.

        Args:
            candidate: candidate whose value is searched

        Returns:
            True if at least one word is a substring of the value, False otherwise
        """
        # Lower-case the searched text once instead of once per word.
        value = candidate.line_data_list[0].value.lower()
        return any(word.lower() in value for word in self.words)
class WordInLine(Feature):
    """Feature is true if line contains at least one word from predefined list."""

    def __init__(self, words: List[str]) -> None:
        """Feature is true if line contains at least one predefined word.

        Args:
            words: list of predefined words
        """
        self.words = words

    def extract(self, candidate: Candidate) -> bool:
        """Check case-insensitively whether any predefined word occurs in the candidate line.

        Only the first entry of ``candidate.line_data_list`` is inspected.

        Args:
            candidate: candidate whose line is searched

        Returns:
            True if at least one word is a substring of the line, False otherwise
        """
        # Lower-case the searched text once instead of once per word.
        line = candidate.line_data_list[0].line.lower()
        return any(word.lower() in line for word in self.words)
class WordInPath(Feature):
    """Feature is true if candidate path contains at least one word from predefined list."""

    def __init__(self, words: List[str]) -> None:
        """Feature is true if candidate path contains at least one predefined word.

        Args:
            words: list of predefined words
        """
        self.words = words

    def extract(self, candidate: Candidate) -> bool:
        """Check case-insensitively whether any predefined word occurs in the candidate path.

        Only the first entry of ``candidate.line_data_list`` is inspected.

        Args:
            candidate: candidate whose path is searched

        Returns:
            True if at least one word is a substring of the path, False otherwise
        """
        # Lower-case the searched text once instead of once per word.
        path = candidate.line_data_list[0].path.lower()
        return any(word.lower() in path for word in self.words)
class HasHtmlTag(Feature):
    """Feature is true if line has HTML tags (HTML file)."""

    def __init__(self) -> None:
        """Prepare a line matcher for the opening fragments of a fixed set of HTML tags."""
        self.word_in_line = WordInLine(
            ['< img', '<img', '< script', '<script', '< p', '<p', '< link', '<link', '< meta', '<meta', '< a',
             '<a'])

    def extract(self, candidate: Candidate) -> bool:
        """Report whether the candidate line looks like it contains an HTML tag.

        Args:
            candidate: candidate whose first line is inspected

        Returns:
            True if the line contains one of the predefined tag openings, or
            contains both ``<`` and ``/>``; False otherwise
        """
        # Logical short-circuit instead of bitwise ``|`` on booleans: the
        # result is the same, but the second check is skipped when not needed.
        if self.word_in_line.extract(candidate):
            return True
        line = candidate.line_data_list[0].line
        tag_closings = ("<", "/>")
        return all(closing in line for closing in tag_closings)
class PossibleComment(Feature):
    r"""Feature is true if candidate line starts with #,\*,/\*? (Possible comment)."""

    def extract(self, candidate: Candidate) -> bool:
        """Tell whether the first line of the candidate begins with a comment marker.

        The line is tested as stored; no whitespace is stripped here.

        Args:
            candidate: candidate whose first line is inspected

        Returns:
            True if the line starts with ``#``, ``*`` or ``/*``; False otherwise
        """
        first_line = candidate.line_data_list[0].line
        # str.startswith accepts a tuple of alternative prefixes.
        return first_line.startswith(("#", "*", "/*"))
class IsSecretNumeric(Feature):
    """Feature is true if candidate value is a numerical value."""

    def extract(self, candidate: Candidate) -> bool:
        """Tell whether the candidate value can be converted with ``float()``.

        Args:
            candidate: candidate whose first value is inspected

        Returns:
            True if ``float(value)`` succeeds, False if it raises ``ValueError``
        """
        try:
            float(candidate.line_data_list[0].value)
        except ValueError:
            is_numeric = False
        else:
            is_numeric = True
        return is_numeric
class RenyiEntropy(Feature):
    """Renyi entropy.

    See next link for details:
    https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf

    Parameters:
        CHARS: Number base
        alpha: entropy parameter
        norm: set True to normalize output probabilities
    """

    # Alphabets selectable through ``base``; the keys are plain strings.
    CHARS = {
        'hex': "1234567890abcdefABCDEF",
        'base36': "abcdefghijklmnopqrstuvwxyz1234567890",
        'base64': "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=",
    }

    def __init__(self, base: Chars, alpha: float, norm=False) -> None:
        """Renyi entropy class initializer.

        Args:
            base: number base type, used as a key into ``CHARS``
            alpha: entropy parameter
            norm: set True to normalize output probabilities, default is False
        """
        # NOTE(review): ``base`` is annotated as Chars while CHARS is keyed by
        # strings - confirm what callers actually pass.
        self.base = base
        self.alpha = alpha
        self.norm = norm

    def extract(self, candidate: Candidate) -> float:
        """Compute the entropy of the candidate value.

        Only the first entry of ``candidate.line_data_list`` is inspected.

        Args:
            candidate: candidate whose value is measured

        Returns:
            Entropy estimated from the character probabilities of the value
        """
        p_x = self.get_probabilities(candidate.line_data_list[0].value)
        return self.estimate_entropy(p_x)

    def get_probabilities(self, data: str) -> np.ndarray:
        """Estimate probabilities of the alphabet's characters that are present in the string.

        Args:
            data: string to analyse

        Returns:
            Array with one entry per alphabet character occurring in ``data``,
            each equal to its count divided by ``len(data)``. When ``norm`` is
            set the entries are rescaled to sum to one. The array is empty if
            no alphabet character occurs in ``data``.

        Raises:
            KeyError: if ``base`` is not a key of ``CHARS``
        """
        # Use the alphabet of the instance's own class instead of reaching for
        # a specific subclass.
        alphabet = self.CHARS[self.base]
        # Count every alphabet character once.
        counts = [data.count(char) for char in alphabet]
        # Keep only characters that are present; a positive count implies
        # len(data) > 0, so the division is safe and every entry is positive.
        p_x = np.array([float(count) / len(data) for count in counts if count > 0])
        # linear weighting of probabilities for theirs normalization
        if self.norm:
            p_x /= p_x.sum()
        return p_x

    def estimate_entropy(self, p_x: np.ndarray) -> float:
        """Calculate Renyi entropy of 'p_x' sequence.

        Function is based on definition of Renyi entropy for arbitrary probability distribution.
        Please see next link for details:
        https://digitalassets.lib.berkeley.edu/math/ucb/text/math_s4_v1_article-27.pdf

        Args:
            p_x: probabilities of the observed characters

        Returns:
            0.0 for an empty ``p_x``; ``log2`` of the number of entries when
            ``alpha`` is (numerically) 0; the Shannon entropy when ``alpha``
            is (numerically) 1; otherwise ``log2(sum(p_x ** alpha)) / (1 - alpha)``
        """
        # Tolerance used to compare alpha with the special values 0 and 1.
        eps = np.finfo(np.float32).eps
        if 0 == len(p_x):
            return 0.0
        if np.abs(0.0 - self.alpha) < eps:
            # corresponds to Hartley or max-entropy
            return np.log2(p_x.size)
        if np.abs(1.0 - self.alpha) < eps:
            # corresponds to Shannon entropy
            return np.sum(-p_x * np.log2(p_x))
        return np.log2((p_x ** self.alpha).sum()) / (1.0 - self.alpha)
class ShannonEntropy(RenyiEntropy):
    """Shannon entropy feature."""

    def __init__(self, base: Chars, norm: bool = False) -> None:
        """Initialize the parent entropy feature with ``alpha`` fixed to 1.0.

        Args:
            base: number base type
            norm: set True to normalize output probabilities, default is False
        """
        super().__init__(base=base, alpha=1.0, norm=norm)
class HartleyEntropy(RenyiEntropy):
    """Hartley entropy feature."""

    def __init__(self, base: Chars, norm: bool = False) -> None:
        """Initialize the parent entropy feature with ``alpha`` fixed to 0.0.

        Args:
            base: number base type
            norm: set True to normalize output probabilities, default is False
        """
        super().__init__(base=base, alpha=0.0, norm=norm)
class FileExtension(Feature):
    """Categorical feature of file type.

    Parameters:
        extensions: extension labels
    """

    def __init__(self, extensions: List[str]) -> None:
        """Store the extension labels the encoder is fitted on.

        Args:
            extensions: extension labels
        """
        self.extensions = extensions

    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
        """Binarize the file extension of every candidate against the known labels.

        The extension is taken with ``os.path.splitext`` from the path of the
        first entry of ``candidate.line_data_list``.

        Args:
            candidates: list of candidates to extract features

        Returns:
            Result of ``LabelBinarizer.transform`` for the candidates'
            extensions. The encoder is created with default settings, i.e.
            dense output, so this is an ndarray rather than a sparse matrix.
        """
        enc = LabelBinarizer()
        enc.fit(self.extensions)
        found_extensions = [os.path.splitext(candidate.line_data_list[0].path)[1] for candidate in candidates]
        return enc.transform(found_extensions)

    def extract(self, candidate: Candidate) -> Any:
        """Not supported: this feature is computed for a whole list via ``__call__``.

        Raises:
            NotImplementedError: always
        """
        raise NotImplementedError
class RuleName(Feature):
    """Categorical feature that corresponds to rule name.

    Parameters:
        rule_names: rule name labels
    """

    def __init__(self, rule_names: List[str]) -> None:
        """Store the rule name labels the encoder is fitted on.

        Args:
            rule_names: rule name labels
        """
        self.rule_names = rule_names

    def __call__(self, candidates: List[Candidate]) -> np.ndarray:
        """Binarize the rule name of every candidate against the known labels.

        Args:
            candidates: list of candidates to extract features

        Returns:
            Result of ``LabelBinarizer.transform`` for the candidates' rule
            names. The encoder is created with default settings, i.e. dense
            output, so this is an ndarray rather than a sparse matrix.
        """
        enc = LabelBinarizer()
        enc.fit(self.rule_names)
        found_rule_names = [candidate.rule_name for candidate in candidates]
        return enc.transform(found_rule_names)

    def extract(self, candidate: Candidate) -> Any:
        """Not supported: this feature is computed for a whole list via ``__call__``.

        Raises:
            NotImplementedError: always
        """
        raise NotImplementedError