MVP for testing

This commit is contained in:
Darryl Nixon 2023-07-16 09:30:36 -07:00
parent 3396d2d69b
commit 9bc6f8d9e1
11 changed files with 459 additions and 3 deletions

3
.flake8 Normal file

@@ -0,0 +1,3 @@
[flake8]
max-line-length = 160
exclude = docs/*, .git, __pycache__, build

124
melamine/classes.py Normal file

@@ -0,0 +1,124 @@
import asyncio
import hashlib
from collections.abc import Generator
from pathlib import Path
from secrets import token_bytes
from typing import List

import aiofiles

from .fileops import find_mount
from .logs import logger


class ShredDir:
    """Class for tracking each directory to be shredded, and its contents."""

    def __init__(self, path: Path) -> None:
        self.absolute_path = path.resolve()
        self.processed = False
        self.contents = self._get_contents()
        self.byte_size = sum(item.byte_size for item in self.contents)
        self.mount_point = find_mount(self.absolute_path)
        self.mount_points = set(self.get_mount_points())
        self.fs_handler = None

    def _get_contents(self) -> List:
        contents = []
        for subpath in self.absolute_path.glob("*"):
            if subpath.is_dir():
                if subpath.is_symlink():
                    logger.warning(f"Symlink subdirectory found: {subpath}, skipping")
                    continue
                contents.append(ShredDir(subpath))
            elif subpath.is_file():
                contents.append(ShredFile(subpath))
        return contents

    def get_mount_points(self) -> Generator:
        for item in self.contents:
            if isinstance(item, ShredDir):
                yield from item.get_mount_points()
        yield self.mount_point

    async def shred(self, hash: bool = False, dryrun: bool = False) -> bool:
        tasks = []
        for item in self.contents:
            # Propagate the processed flag so nested files pass the early-shred guard.
            item.processed = self.processed
            tasks.append(item.shred(hash, dryrun))
        return all(await asyncio.gather(*tasks))

    def __hash__(self) -> int:
        return hash(self.absolute_path)


class ShredFile:
    """Class for tracking each file to be shredded."""

    def __init__(self, path: Path) -> None:
        self.absolute_path = path.resolve()
        self.byte_size = path.stat().st_size
        self.processed = False
        self.mount_point = find_mount(self.absolute_path)
        self.fs_handler = None
        self.hardlinks = None

    async def shred(self, hash: bool = False, dryrun: bool = False) -> bool:
        """Shred the file with a single file descriptor."""
        if not self.processed:
            logger.error(f"File {self.absolute_path} tried to shred early.")
            return False
        try:
            logger.info(f"Shredding file: {self.absolute_path}")
            async with aiofiles.open(self.absolute_path, "rb+") as file:
                if hash:
                    sha1 = hashlib.sha1(usedforsecurity=False)
                    # aiofiles has no iterate() helper; read in fixed-size chunks.
                    while chunk := await file.read(65536):
                        sha1.update(chunk)
                    self.sha1 = sha1.digest()
                    logger.info(f"Got hash {sha1.hexdigest()}")

                # First pass: overwrite with binary zeroes
                logger.info("Performing first pass: Overwriting with binary zeroes")
                await file.seek(0)
                if not dryrun:
                    await file.write(b"\x00" * self.byte_size)
                    await file.flush()

                # Second pass: overwrite with binary ones
                logger.info("Performing second pass: Overwriting with binary ones")
                await file.seek(0)
                if not dryrun:
                    await file.write(b"\xff" * self.byte_size)
                    await file.flush()

                # Third pass: overwrite with random data
                logger.info("Performing third pass: Overwriting with random data")
                await file.seek(0)
                random_data = token_bytes(self.byte_size)
                if not dryrun:
                    await file.write(random_data)
                    await file.flush()

            # Remove the file (the aiofiles handle has no unlink; use the path)
            logger.info(f"Removing file {self.absolute_path}")
            if not dryrun:
                self.absolute_path.unlink()

            # Remove any hardlinks
            if self.hardlinks:
                logger.info(f"Removing {len(self.hardlinks)} hardlinks")
                if not dryrun:
                    for link in self.hardlinks:
                        link.unlink()

            return True
        except Exception as e:
            logger.error(f"File wipe failed: {e}")
            return False

    def __hash__(self) -> int:
        return hash(self.absolute_path)
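A minimal usage sketch of the classes above (hypothetical path; processed is set by hand here because shred.py normally does it):

import asyncio
from pathlib import Path
from melamine.classes import ShredFile

async def demo() -> None:
    target = ShredFile(Path("/tmp/scratch.bin"))  # hypothetical existing file
    target.processed = True  # the guard in shred() refuses files not yet processed
    ok = await target.shred(hash=True, dryrun=True)  # dryrun logs the passes without writing
    print(ok, target.sha1.hex())

asyncio.run(demo())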

56
melamine/cli.py Normal file

@@ -0,0 +1,56 @@
import asyncio
import sys
from argparse import ArgumentParser

import uvloop

from .shred import main
from .validators import *

# flake8: noqa: E501


def run() -> None:
    validate_environment()
    parser = ArgumentParser(description="Comprehensive DoD 5220.22-M file shredder for Linux.")
    parser.add_argument(
        "--recursive", "-r", action="store_true", help="Process directories recursively. Default is false."
    )
    parser.add_argument("--yes", "-y", action="store_true", help="Skip confirmation prompts. Default is false.")
    parser.add_argument(
        "--dryrun", "-d", action="store_true", help="Provide mock output without deleting anything. Default is false."
    )
    parser.add_argument(
        "--exhaustive",
        "-e",
        action="store_true",
        help="Exhaustively check local mounts for duplicate files by hash. Default is false.",
    )
    # shred.main() reads job.hash, so the flag must exist on the namespace.
    parser.add_argument(
        "--hash", action="store_true", help="Log a SHA-1 hash of each file before shredding. Default is false."
    )
    parser.add_argument(
        "--ignoredir",
        "-i",
        action="append",
        type=validate_file_folder,
        default=[],
        help="Specify directories to be ignored during the process. This option can be used multiple times.",
    )
    parser.add_argument("--logfile", "-o", type=validate_logfile, help="Specify a file to log all output.")
    parser.add_argument("--quiet", "-q", action="store_true", help="Silence all output.")
    parser.add_argument("--verbose", "-v", action="store_true", help="Provide extra output for debugging.")
    parser.add_argument(
        "paths",
        nargs="+",
        type=validate_file_folder,
        help="Specify any number of existing files or directories to be processed.",
    )
    args = parser.parse_args()
    # asyncio.Runner only exists on Python 3.11+; fall back to uvloop.install() otherwise.
    if sys.version_info >= (3, 11):
        with asyncio.Runner(loop_factory=uvloop.new_event_loop) as runner:
            runner.run(main(args))
    else:
        uvloop.install()
        asyncio.run(main(args))


if __name__ == "__main__":
    run()

50
melamine/fileops.py Normal file

@@ -0,0 +1,50 @@
import asyncio
from pathlib import Path
from typing import List

from asyncstdlib.functools import lru_cache

from .filesystems import FSHandlers
from .logs import logger


def find_mount(path: Path) -> Path:
    """Find the mount point for a given path."""
    path = path.absolute()
    while not path.is_mount():
        path = path.parent
    return path


def get_all_mounts() -> List:
    """Get a list of all mounted filesystems."""
    mounts = []
    with open("/proc/mounts", "r") as f:
        for line in f:
            mount = line.split()[1]
            mounts.append(mount)
    return mounts


@lru_cache(maxsize=1024)
async def mount_to_fs_handler(path: Path):
    # Returns a handler instance from FSHandlers for the filesystem at path.
    # TODO: This is a hacky way to get the filesystem type, but it works for now.
    # Maybe with libblkid Python bindings?
    proc = await asyncio.create_subprocess_exec(
        "stat", "-f", "-L", "-c", "%T", str(path), stdout=asyncio.subprocess.PIPE, stdin=asyncio.subprocess.PIPE
    )
    stdout, _ = await proc.communicate()
    if proc.returncode != 0:
        err = f"Unable to get filesystem for {path}"
        logger.error(err)
        raise RuntimeError(err)
    fs = stdout.decode().strip()
    try:
        return FSHandlers[fs]
    except KeyError:
        err = f"Unsupported filesystem: {fs}"
        logger.error(err)
        raise RuntimeError(err)
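GNU stat's %T format prints a human-readable filesystem type and reports ext2, ext3, and ext4 volumes alike as "ext2/ext3", which is why FSHandlers is keyed that way. A short sketch of the lookup flow, assuming a hypothetical path on an ext4 /home:

import asyncio
from pathlib import Path
from melamine.fileops import find_mount, mount_to_fs_handler

async def demo() -> None:
    mount = find_mount(Path("/home/user/notes.txt"))  # hypothetical; resolves to /home
    handler = await mount_to_fs_handler(mount)  # e.g. an EXT23Handler instance
    print(mount, handler.fs)  # e.g. "/home ext2/ext3"

asyncio.run(demo())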

7
melamine/filesystems/__init__.py Normal file

@@ -0,0 +1,7 @@
from .ext23 import EXT23Handler
from .zfs import ZFSHandler

# from .btrfs import BTRFSHandler
# from .ext4 import EXT4Handler

FSHandlers = {"zfs": ZFSHandler(), "ext2/ext3": EXT23Handler()}

78
melamine/filesystems/ext23.py Normal file

@@ -0,0 +1,78 @@
import ctypes
from collections.abc import Generator
from pathlib import Path


class ext2_filsys(ctypes.Structure):
    pass


class ext2_inode_scan(ctypes.Structure):
    pass


class ext2_inode_large(ctypes.Structure):
    _fields_ = [
        ("i_mode", ctypes.c_uint16),
        ("i_uid", ctypes.c_uint16),
        ("i_size", ctypes.c_uint32),
        ("i_atime", ctypes.c_uint32),
        ("i_ctime", ctypes.c_uint32),
        ("i_mtime", ctypes.c_uint32),
        ("i_dtime", ctypes.c_uint32),
        ("i_gid", ctypes.c_uint16),
        ("i_links_count", ctypes.c_uint16),
        ("i_blocks", ctypes.c_uint32),
        ("i_flags", ctypes.c_uint32),
        ("i_osd1", ctypes.c_uint32 * 3),
        ("i_block", ctypes.c_uint32 * 15),
        ("i_generation", ctypes.c_uint32),
        ("i_file_acl", ctypes.c_uint32),
        ("i_dir_acl", ctypes.c_uint32),
        ("i_faddr", ctypes.c_uint32),
        ("i_osd2", ctypes.c_uint8 * 12),
    ]


class ext2_inode_large_p(ctypes.POINTER(ext2_inode_large)):
    pass


class EXT23Handler:
    def __init__(self) -> None:
        self.fs = "ext2/ext3"
        self.libext2fs = ctypes.CDLL("libext2fs.so.2")
        self.libext2fs.ext2fs_open.restype = ctypes.c_int
        self.libext2fs.ext2fs_open.argtypes = [
            ctypes.c_char_p,
            ctypes.c_int,
            ctypes.c_int,
            ctypes.c_uint32,
            ctypes.POINTER(ext2_filsys),
        ]
        self.libext2fs.ext2fs_close.argtypes = [ext2_filsys]
        self.libext2fs.ext2fs_get_next_inode.argtypes = [ext2_inode_scan, ext2_inode_large_p]
        self.libext2fs.ext2fs_get_next_inode.restype = ctypes.c_int

    async def get_hardlinks(self, path: Path) -> Generator:
        path = path.resolve().absolute()
        inode = path.stat().st_ino
        fs = ext2_filsys()
        ret = self.libext2fs.ext2fs_open(str(path).encode(), 0, 0, 0, ctypes.byref(fs))
        if ret != 0:
            return  # an async generator cannot return a value; just stop iterating
        scan = ext2_inode_scan()
        ret = self.libext2fs.ext2fs_open_inode_scan(fs, ctypes.byref(scan))
        if ret != 0:
            self.libext2fs.ext2fs_close(fs)
            return
        inode_large = ext2_inode_large()
        while self.libext2fs.ext2fs_get_next_inode(scan, ctypes.byref(inode_large)) == 0:
            if inode_large.i_links_count > 1 and inode_large.i_file_acl == inode:
                yield Path(fs.fs_mount_point) / scan.name.decode()
        self.libext2fs.ext2fs_close_inode_scan(scan)
        self.libext2fs.ext2fs_close(fs)
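The libext2fs scan above is still placeholder-level MVP code (the comparison against i_file_acl and the scan/name fields are stand-ins, not the real API). A filesystem-agnostic sketch of the same goal, matching st_ino and st_dev while walking the mount point:

import os
from pathlib import Path

def find_hardlinks_walk(target: Path, mount: Path):
    # Yield other paths under mount that share target's inode.
    st = target.stat()
    if st.st_nlink < 2:
        return  # no additional links exist
    for root, _dirs, files in os.walk(mount):
        for name in files:
            candidate = Path(root) / name
            try:
                cst = candidate.lstat()
            except OSError:
                continue  # unreadable entry, skip it
            if cst.st_ino == st.st_ino and cst.st_dev == st.st_dev and candidate != target:
                yield candidate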

25
melamine/filesystems/zfs.py Normal file

@@ -0,0 +1,25 @@
from collections.abc import Generator
from pathlib import Path

import pyzfs


class ZFSHandler:
    def __init__(self) -> None:
        self.fs = "zfs"

    async def get_hardlinks(self, path: Path) -> Generator:
        path = path.resolve().absolute()
        inode = path.stat().st_ino
        zfs = pyzfs.ZFS()
        dataset = zfs.get_dataset_by_path(str(path))
        if dataset is not None:
            pool = dataset.pool
            filesystem = dataset.filesystem
            fs = pool.open(filesystem)
            for snapshot in fs.snapshots():
                for entry in snapshot.ls(str(path)):
                    if entry.inode() == inode:
                        yield Path(entry.path())

35
melamine/logs.py Normal file

@@ -0,0 +1,35 @@
from loguru import logger

logger.add(
    "app.log",
    format="<level><light-blue>{time:YYYY-MM-DD HH:mm:ss} | {message}</light-blue></level>",
    level="INFO",
    rotation="1 day",
    retention="30 days",
)

logger.add(
    "errors.log",
    format="<level><yellow>⚠️ {time:YYYY-MM-DD HH:mm:ss} | {message}</yellow></level>",
    level="WARNING",
    rotation="1 day",
    retention="30 days",
)

logger.add(
    "error.log",
    format="<level><red>⛔️ {time:YYYY-MM-DD HH:mm:ss} | {message}</red></level>",
    level="ERROR",
    rotation="1 day",
    retention="30 days",
)

logger.add(
    "critical.log",
    format="<level><magenta>🚨 {time:YYYY-MM-DD HH:mm:ss} | {message}</magenta></level>",
    level="CRITICAL",
    rotation="1 day",
    retention="30 days",
)

43
melamine/shred.py Normal file

@@ -0,0 +1,43 @@
from .classes import ShredDir
from .classes import ShredFile
from .fileops import mount_to_fs_handler
from .logs import logger


async def main(job) -> bool:
    """
    This is the main function for processing a shred request.
    It is called by the CLI and builds a job queue based on the arguments passed.
    """
    new_paths = set()

    # Expand all directories and files, and collect mount point information
    for path in job.paths:
        if path.is_file():
            logger.info(f"Adding file: {path}")
            shred_file = ShredFile(path)
            shred_file.fs_handler = await mount_to_fs_handler(shred_file.mount_point)
            new_paths.add(shred_file)
        elif path.is_dir():
            if job.recursive:
                logger.info(f"Adding directory: {path}")
                shred_dir = ShredDir(path)
                shred_dir.fs_handler = await mount_to_fs_handler(shred_dir.mount_point)
                new_paths.add(shred_dir)
            else:
                logger.info(f"Skipping directory: {path} (try -r/--recursive)")
        else:
            raise TypeError(f"Not a file or directory: {path}")
    job.paths = new_paths

    # Get hardlinks to subsequently unlink for all files
    for path in job.paths:
        if isinstance(path, ShredFile):
            path.hardlinks = set(link async for link in path.fs_handler.get_hardlinks(path.absolute_path))

    # Shred all physical files including hardlinks; mark each path as processed
    # first so the early-shred guard in ShredFile.shred() passes.
    results = []
    for path in job.paths:
        path.processed = True
        if isinstance(path, (ShredFile, ShredDir)):
            results.append(await path.shred(job.hash, job.dryrun))
    return all(results)
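A sketch of driving main() directly with an argparse-style namespace (hypothetical path, dryrun so nothing is actually deleted):

import asyncio
from argparse import Namespace
from pathlib import Path
from melamine.shred import main

args = Namespace(
    paths=[Path("/tmp/scratch.bin")],  # hypothetical target
    recursive=False, exhaustive=False, hash=False, dryrun=True,
    ignoredir=[], logfile=None, quiet=False, verbose=False, yes=True,
)
asyncio.run(main(args))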

30
melamine/validators.py Normal file

@@ -0,0 +1,30 @@
import os
import platform
import sys
from argparse import ArgumentTypeError
from pathlib import Path


def validate_file_folder(value: str) -> Path:
    file_folder_path = Path(value)
    # Raise ArgumentTypeError so argparse reports a clean usage error
    # instead of an unhandled traceback.
    if not file_folder_path.exists():
        raise ArgumentTypeError(f"No such file or folder: {value}")
    if not file_folder_path.is_file() and not file_folder_path.is_dir():
        raise ArgumentTypeError(f"Not a file or directory: {value}")
    return file_folder_path


def validate_logfile(value: str) -> Path:
    logfile_path = Path(value)
    if logfile_path.exists():
        confirm = input(f"The file {value} already exists. Do you want to overwrite it? ([y]es/[n]o): ")
        if confirm.lower() not in ["yes", "y"]:
            sys.exit(f"Exiting. Log file {value} will not be overwritten.")
    return logfile_path


def validate_environment() -> None:
    if platform.system() != "Linux":
        sys.exit("Error: This script must be run on a Linux system.")
    if os.getuid() != 0:
        sys.exit("Error: This script must be run with sudo or root privileges.")

pyproject.toml

@@ -10,7 +10,14 @@ description = "A comprehensive file shredder for Linux"
 readme = "README.md"
 requires-python = ">=3.9"
 license = { text = "MIT" }
-dependencies = ["fastapi==0.95.2"]
+dependencies = [
+    "loguru==0.7.0",
+    "aiofiles==23.1.0",
+    "uvloop==0.17.0",
+    "pyzfs==0.2.3",
+    "asyncstdlib==3.10.8",
+    "psutil==5.9.5",
+]

 [project.scripts]
 ghostforge_adduser = "melamine.cli:run"
@@ -24,8 +31,6 @@ py-modules = ["melamine"]

 [tool.bandit]
 exclude_dirs = ["/doc", "/build"]
-# TODO: Stop skipping B104 (binding on 0.0.0.0), is there a nice way to get a good docker bind address?
-skips = ["B104"]

 [tool.black]
 line-length = 120