#!/bin/python3

import hashlib
from os import listdir, rmdir
from os.path import isfile, join, exists, getsize, islink
import sys
from typing import Dict
from multiprocessing import cpu_count, Pool

READ_BYTE_COUNT = 1000000  # bytes read per chunk (1 MB, decimal)

def hashFile(filePath: str) -> str:
    """Return the hex-encoded SHA-256 digest of the file at ``filePath``.

    The file is read in chunks of ``READ_BYTE_COUNT`` bytes so that large
    files never have to fit in memory at once.

    Args:
        filePath: Path of the file to hash; opened in binary mode.

    Returns:
        The SHA-256 digest as a lowercase hexadecimal string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.sha256()
    # The context manager guarantees the handle is closed, even when a read
    # fails part-way through.
    with open(filePath, "rb") as fileToHash:
        # read() returns b"" at end of file, which ends the loop.
        while chunk := fileToHash.read(READ_BYTE_COUNT):
            digest.update(chunk)
    return digest.hexdigest()

class FileInfo:
    """Path of a file plus lazily computed, cached size and SHA-256 hash.

    The size and hash are only computed the first time they are requested
    and are then remembered. Changing the path with ``setPath`` does not
    clear values that were already cached.
    """
    # Class-level defaults; None means "not set / not computed yet".
    filePath: str = None
    size: int = None
    sha256Hash: str = None

    def __init__(self, filePath: str):
        """Remember ``filePath``; nothing is read from disk here."""
        self.filePath = filePath

    def getHash(self) -> str:
        """Return the file's SHA-256 hex digest, hashing it on first use."""
        # Identity test: None is a singleton, and `is` cannot be fooled by
        # a custom __eq__.
        if self.sha256Hash is None:
            self.sha256Hash = hashFile(self.filePath)
        return self.sha256Hash

    def getPath(self) -> str:
        """Return the path currently stored for this file."""
        return self.filePath

    def setPath(self, path: str) -> None:
        """Replace the stored path; cached size/hash are left untouched."""
        self.filePath = path

    def getSize(self) -> int:
        """Return the file size in bytes, querying the filesystem on first use."""
        if self.size is None:
            self.size = getsize(self.filePath)
        return self.size


def getFilesInDirectory(dirPath: str) -> list[FileInfo]:
    """Return a FileInfo for every regular file directly inside ``dirPath``.

    Only the immediate contents are inspected (no recursion). An entry is
    included when ``os.path.isfile`` reports it as a file. Each returned
    FileInfo carries ``dirPath`` joined to the entry name with a single "/".

    Args:
        dirPath: Directory to list, with or without a trailing "/".

    Returns:
        The FileInfo objects, in the order ``listdir`` reported the entries.

    Raises:
        SystemExit: With exit code -1 (after printing an error message) when
            the directory cannot be listed.
    """
    try:
        dirContents = listdir(dirPath)
    except OSError:
        # Only filesystem failures are treated as "cannot list"; things like
        # KeyboardInterrupt are no longer swallowed by a bare except.
        print("Error: couldn't list directory contents of " + dirPath)
        sys.exit(-1)
    # endswith() is safe on any string, unlike indexing dirPath[-1].
    prefix = dirPath if dirPath.endswith("/") else dirPath + "/"
    # isfile() is already False for missing entries, so no exists() check.
    return [FileInfo(prefix + name) for name in dirContents if isfile(join(dirPath, name))]

def getDirsInDirectory(dirPath: str) -> list[str]:
    """Return the paths of the real sub-directories directly inside ``dirPath``.

    Symbolic links are skipped, so links to directories are never followed.
    Entries that are neither files nor directories (FIFOs, sockets, device
    nodes) are skipped as well; treating them as directories would make a
    later attempt to list them fail.

    Args:
        dirPath: Directory to list, with or without a trailing "/".

    Returns:
        ``dirPath`` joined with a single "/" to each sub-directory name, in
        the order ``listdir`` reported the entries.

    Raises:
        SystemExit: With exit code -1 (after printing an error message) when
            the directory cannot be listed.
    """
    # Imported locally because the module-level import list lacks isdir.
    from os.path import isdir

    try:
        dirContents = listdir(dirPath)
    except OSError:
        # Only filesystem failures are treated as "cannot list"; things like
        # KeyboardInterrupt are no longer swallowed by a bare except.
        print("Error: couldn't list directory contents of " + dirPath)
        sys.exit(-1)
    # endswith() is safe on any string, unlike indexing dirPath[-1].
    prefix = dirPath if dirPath.endswith("/") else dirPath + "/"
    dirs = []
    for name in dirContents:
        entryPath = join(dirPath, name)
        if isdir(entryPath) and not islink(entryPath):
            dirs.append(prefix + name)
    return dirs

def findAllFilesRecursively(topLevelDir: str) -> list[FileInfo]:
    """Collect a FileInfo for every file in ``topLevelDir`` and below.

    Files directly inside ``topLevelDir`` come first, followed by the files
    of each sub-directory (as reported by ``getDirsInDirectory``), visited
    depth-first.
    """
    collected = getFilesInDirectory(topLevelDir)
    for subDir in getDirsInDirectory(topLevelDir):
        collected.extend(findAllFilesRecursively(subDir))
    return collected

def findAllDirsRecursively(topLevelDir: str) -> list[str]:
    """Return the path of every directory nested anywhere below ``topLevelDir``.

    The direct sub-directories come first, followed by the descendants of
    each of them in turn (depth-first). ``topLevelDir`` itself is not part of
    the result.
    """
    # List the directory once and reuse the result: a second listing costs
    # an extra filesystem call and could disagree with the first one.
    childDirs = getDirsInDirectory(topLevelDir)
    # Copy so that extending the result does not change what is iterated.
    allDirs = list(childDirs)
    for childDir in childDirs:
        allDirs.extend(findAllDirsRecursively(childDir))
    return allDirs
    
def findMatches(fileInfos: list[FileInfo], fileInfo: FileInfo) -> list[FileInfo]:
    """Return the entries of ``fileInfos`` whose hash equals that of ``fileInfo``.

    Entries with the same path as ``fileInfo`` are skipped without hashing,
    so a file is never reported as a match for itself.
    """
    return [
        candidate
        for candidate in fileInfos
        if candidate.getPath() != fileInfo.getPath()
        and candidate.getHash() == fileInfo.getHash()
    ]

def getFilesGroupedBySize(fileInfos: list[FileInfo]) -> Dict[int, list[FileInfo]]:
    """Bucket ``fileInfos`` by file size.

    Returns a dict mapping each size in bytes to the list of FileInfo
    objects with that size, in their original relative order.
    """
    bySize: Dict[int, list[FileInfo]] = {}
    for info in fileInfos:
        # setdefault creates the bucket the first time a size is seen.
        bySize.setdefault(info.getSize(), []).append(info)
    return bySize

def findIdenticalFiles(fileInfos: list[FileInfo]) -> list[list[FileInfo, FileInfo]]:
    """Return every pair of files in ``fileInfos`` with identical content hashes.

    Files are first grouped by size, so only files of equal size are ever
    compared by hash. Each result element is a two-item list
    ``[file, laterMatch]``.
    """
    pairs = []
    for sameSizeFiles in getFilesGroupedBySize(fileInfos).values():
        for position, current in enumerate(sameSizeFiles):
            # Only look at the tail of the group so each pair is reported
            # once; findMatches skips `current` itself by path.
            tail = sameSizeFiles[position:]
            pairs.extend([current, twin] for twin in findMatches(tail, current))
    return pairs

def findLargestFile(duplicateFileList: list[list[FileInfo, FileInfo]]) -> int:
    """Return the index of the pair whose first file is the largest.

    Only the first element of each pair is measured. On ties the earliest
    index wins, and 0 is returned when the list is empty or no size is
    greater than zero.
    """
    bestIndex, bestSize = 0, 0
    for index, pair in enumerate(duplicateFileList):
        candidateSize = pair[0].getSize()
        # Strict comparison keeps the earliest pair among equal sizes.
        if candidateSize > bestSize:
            bestIndex, bestSize = index, candidateSize
    return bestIndex

def sortByFileSize(duplicateFileList: list[list[str, str, int]]) -> list[list[str, str, int]]:
    """Return the entries of ``duplicateFileList`` ordered largest-first.

    The entry chosen by ``findLargestFile`` is repeatedly moved from the
    input into the result, so the input list is empty when this returns.
    The number of entries still to be sorted is shown on stdout as a
    single, continually overwritten progress line.
    """
    ordered = []
    print()
    while duplicateFileList:
        # ESC[2K clears the current terminal line before the count is redrawn.
        sys.stdout.buffer.write(b'\x1b[2K')
        sys.stdout.flush()
        print(len(duplicateFileList), end='\r', flush=True)
        ordered.append(duplicateFileList.pop(findLargestFile(duplicateFileList)))
    print()
    return ordered

def sortByFileSizeAndGetBiggest(duplicateFileList: list[list[FileInfo, FileInfo]], numToGet: int) -> list[list[FileInfo, FileInfo]]:
    """Return up to ``numToGet`` pairs from ``duplicateFileList``, largest first.

    The selected pairs are removed from ``duplicateFileList``. When
    ``numToGet`` exceeds the list length, every pair is returned.
    """
    wanted = min(numToGet, len(duplicateFileList))
    # Each step removes the current largest pair, so the next call to
    # findLargestFile sees only the remaining ones.
    return [
        duplicateFileList.pop(findLargestFile(duplicateFileList))
        for _ in range(0, wanted)
    ]

def hasFile(fileInfos1: list[FileInfo], file: FileInfo) -> bool:
    """Tell whether any entry of ``fileInfos1`` has the same hash as ``file``."""
    # any() stops at the first match, just like an early return would.
    return any(known.getHash() == file.getHash() for known in fileInfos1)

def findFilesUniqueToListOne(fileInfos1: list[FileInfo], fileInfos2: list[FileInfo]) -> list[FileInfo]:
    """Return the files of ``fileInfos1`` whose hash appears nowhere in ``fileInfos2``.

    The order of ``fileInfos1`` is preserved in the result.
    """
    return [candidate for candidate in fileInfos1 if not hasFile(fileInfos2, candidate)]

def getFileCountBySize(fileInfos: list[FileInfo]) -> Dict[int, int]:
    """Count how many of ``fileInfos`` have each file size.

    Returns a dict mapping size in bytes to the number of files of that size.
    """
    tally: Dict[int, int] = {}
    for info in fileInfos:
        byteCount = info.getSize()
        # A size not seen before starts from zero.
        tally[byteCount] = tally.get(byteCount, 0) + 1
    return tally

def getFilesWithSizeMatches(fileInfos: list[FileInfo]) -> list[FileInfo]:
    """Return the files that share their size with at least one other file.

    The order of ``fileInfos`` is preserved in the result.
    """
    countsBySize = getFileCountBySize(fileInfos)
    return [info for info in fileInfos if countsBySize[info.getSize()] > 1]

def getFileCountByHash(fileInfos: list[FileInfo]) -> Dict[str, int]:
    """Count how many of ``fileInfos`` have each SHA-256 hash.

    Returns a dict mapping hex digest to the number of files with that
    digest. Every file is hashed unless its hash is already cached.
    """
    tally: Dict[str, int] = {}
    for info in fileInfos:
        digest = info.getHash()
        # A digest not seen before starts from zero.
        tally[digest] = tally.get(digest, 0) + 1
    return tally

def hashFileInfo(fileInfo: FileInfo) -> str:
    """Hash one file and return its digest; used as the Pool worker function.

    The digest has to be returned: in a worker process ``fileInfo`` is a
    pickled copy, so caching the hash on it alone never reaches the parent.
    """
    return fileInfo.getHash()


def computeHashesInParallel(fileInfos: list[FileInfo]) -> None:
    """Compute the hashes of ``fileInfos`` using one worker process per CPU.

    After this returns, every FileInfo in the list has its ``sha256Hash``
    set, so later ``getHash()`` calls return immediately.

    Args:
        fileInfos: Files to hash; entries that already have a hash are
            skipped.
    """
    # Only ship files that still need hashing to the workers.
    pending = [fileInfo for fileInfo in fileInfos if fileInfo.sha256Hash is None]
    if not pending:
        return
    # The context manager shuts the pool down instead of leaking workers.
    with Pool(cpu_count()) as pool:
        digests = pool.map(hashFileInfo, pending)
    # map() preserves input order, so results line up with `pending`. Store
    # them on the parent's objects, which the workers cannot modify.
    for fileInfo, digest in zip(pending, digests):
        fileInfo.sha256Hash = digest
