Calculating the MD5 hash of multiple files

Problem description

My goal is to rename every file in a directory to its MD5 hash, so that it is easier to check for duplicates.

I currently have about 30,000 files to process. However, after testing the code on small batches of files to make sure it works correctly, I ran into this error:

Error: EMFILE: too many open files...

Yes, I have tried to research this problem, as well as several other similar questions such as Node and Error: EMFILE, too many open files.

I think it has to do with how I open the files and with the asynchronous operations, but I don't know how to write the code correctly that way.

This was my first attempt at solving the problem.

const crypto = require('crypto');
const fs = require('fs');
const path = require('path');

let countProcess = 0;
let countOpen = 0;
let countRead = 0;
let countClose = 0;
const md5hashtable = [];
async function processFilesMD5(routePath) {
    // Get files/folders in path
    await fs.readdirSync(routePath).forEach((file) => {
        const filepath = path.join(routePath, file);
        // Check if folder is dir to do tree walk
        fs.stat(filepath, async (err, stat) => {
            if (stat.isDirectory()) {
                await processFilesMD5(filepath);
            // Calculate md5 of file
            } else {
                let filename = path.basename(filepath).replace(path.extname(filepath), "")
                if (RegExp('^[a-f0-9]{32}$', 'gm').test(filename)){
                    if (md5hashtable.includes(filename)){
                        console.log(`\nFound dup: ${filename} loc: ${filepath}\n`)
                        fs.unlinkSync(filepath)
                    } else {
                        if (!(path.basename(filepath) === `${filename}${path.extname(filepath)}`)){
                            fs.renameSync(filepath, `${filepath.replace(path.basename(filepath), "")}${filename}${path.extname(filepath)}`)
                        }
                        md5hashtable.push(filename)
                    }
                    countProcess++;
                } else {
                    countProcess++;
                    countOpen++;
                    let hash = crypto.createHash('md5')
                    let stream = fs.createReadStream(filepath)
                    console.log(`Created Stream with ID: ${countOpen}`)
                    await stream.on('data', function (data) {
                        hash.update(data, 'utf8')
                        countRead++;
                        // console.log(`Reading Stream with chunk ID: ${countRead}`)
                    })
                    await stream.on('end', function () {
                        countClose++;
                        // console.log(`Closing Stream with ID: ${countClose}`)
                        const md5name = hash.digest('hex')
                        if (md5hashtable.includes(md5name)){
                            console.log(`\nFound dup: ${md5name} loc: ${filepath}\n`)
                            fs.unlinkSync(filepath)
                        } else {
                            if (!(path.basename(filepath) === `${md5name}${path.extname(filepath)}`)){
                                fs.renameSync(filepath, `${filepath.replace(path.basename(filepath), "")}${md5name}${path.extname(filepath)}`)
                            }
                            md5hashtable.push(md5name)
                        }
                        console.log(`File: ${filepath} has hash: ${md5name}`)
                        stream.destroy()
                    })
                }
            }
        });
    });
    console.log(`Current Route: ${routePath}\nTotal files processed: ${countProcess}\nFiles Opened: ${countOpen}\nChunks Read: ${countRead}\nFiles Closed: ${countClose}`)
}

processFilesMD5(`${path.join(__dirname, 'media')}`).then(() => {
    console.log('Done')
})

Here is my second attempt at the problem, which I also cleaned up for simplicity.

const crypto = require('crypto');
const fs = require('fs');
const path = require('path');

const md5hashtable = [];

function calculateMD5(filepath) {
    let hash = crypto.createHash('md5')
    let stream = fs.createReadStream(filepath)
    console.log(`Created Stream`)

    stream.on('data', function (data) {
        hash.update(data, 'utf8')
        console.log(`Reading Stream`)
    })

    stream.on('end', function () {
        const MD5hash = hash.digest('hex')
        if (dupHashCheck(MD5hash)){ // Hash already exists
            console.log(`\nFound dup: ${MD5hash} loc: ${filepath}\n`)
            fs.unlink(filepath) // Deletes duplicate
        } else { // Hash does not exist
            md5hashtable.push(MD5hash)
        }
        console.log(`File: ${filepath}\nHash: ${MD5hash}\n`)
        stream.destroy()
        console.log(`Closing Stream`)
    })
}

function validateMD5(hash){
    return RegExp('^[a-f0-9]{32}$', 'gm').test(hash);
}

function dupHashCheck(hash){
    return md5hashtable.includes(hash)
}

function processImageRoute(routePath) {
    fs.readdir(routePath, (err, files) => { // Get files in path
        files.forEach(file => {
            let filepath = path.join(routePath, file); // Join root dir with path of folder
            fs.stat(filepath, async (err, stat) => { // Get stats of dir
                if (stat.isDirectory()) { // If dir is folder, run recursively
                    processImageRoute(filepath);
                } else { // Continue
                    let filename = path.basename(filepath).replace(path.extname(filepath), "") // Get filename without extension
                    if (validateMD5(filename)){ // Filename is a valid md5 hash
                        if (dupHashCheck(filename)){ // Hash already exists
                            console.log(`\nFound dup: ${filename} loc: ${filepath}\n`)
                            fs.unlink(filepath) // Deletes duplicate
                        } else { // Hash does not exist
                            md5hashtable.push(filename)
                        }
                    } else { // Isn't a valid md5 hash
                        calculateMD5(filepath)
                    }
                }
            })
        })
    })
}

processImageRoute(`${path.join(__dirname, 'media')}`)

Neither version works, because they open too many files; on small batches, however, they work perfectly. Also, this is my first question, so I'm open to any suggestions and comments.

Tags: node.js, express, asynchronous, md5

Solution


Following @codeness93's suggestion of promisifying the code, here is what I did:

global.fs = require('fs-extra');
const crypto = require('crypto');
const path = require('path');

const md5hashtable = [];

function calculateMD5(filePath) {
    return new Promise((resolve, reject) => {
        let hash = crypto.createHash('md5')
        let stream = fs.createReadStream(filePath)

        stream.on('error', function (err) {
            reject(err);
        })

        stream.on('data', function (data) {
            hash.update(data, 'utf8')
        })

        stream.on('end', function () {
            stream.close();
            resolve(hash.digest('hex'));
        })
    });
}

function validateMD5(hash){
    return RegExp('^[a-f0-9]{32}$', 'gm').test(hash);
}

function dupHashCheck(hash){
    return md5hashtable.includes(hash)
}

function renameFile(filePath, fileHash){
    try {
        fs.renameSync(filePath, `${filePath.replace(path.basename(filePath), "")}${fileHash}${path.extname(filePath)}`)
    } catch (e){
        throw new Error(e)
    }
}

function processImageRoute(routePath) {
    fs.readdir(routePath, (err, files) => { // Get files in path
        files.forEach(file => {
            let filePath = path.join(routePath, file); // Join root dir with path of folder
            fs.stat(filePath, async (err, stat) => { // Get stats of dir
                if (stat.isDirectory()) { // If dir is folder, run recursively
                    processImageRoute(filePath);
                } else { // Continue
                    let fileName = path.basename(filePath).replace(path.extname(filePath), "") // Get fileName without extension
                    if (validateMD5(fileName)){ // fileName is a valid md5 hash
                        if (dupHashCheck(fileName)){ // Hash already exists
                            console.log(`\nFound dup: ${fileName} loc: ${filePath}\n`)
                            fs.unlink(filePath) // Deletes duplicate
                        } else { // Hash does not exist
                            md5hashtable.push(fileName)
                        }
                    } else { // Isn't a valid md5 hash
                        await calculateMD5(filePath).then(function(fileHash){
                            if (validateMD5(fileHash)){
                                if (dupHashCheck(fileHash)){ // Hash already exists
                                    console.log(`\nFound dup: ${fileName} loc: ${filePath}\n`)
                                    fs.unlink(filePath) // Deletes duplicate
                                } else { // Hash does not exist
                                    renameFile(filePath, fileHash); // Renames the file to its hash plus extension
                                    md5hashtable.push(fileHash)
                                }
                                console.log(`File: ${filePath}\nHash: ${fileHash}\n`)
                            } else {
                                throw new Error(`Unable to calculate hash for file: ${fileName}\nError: ${fileHash}\n`)
                            }
                        })
                    }
                }
            })
        })
    })
}

processImageRoute(`${path.join(__dirname, 'media')}`)

I'm not sure whether it was adding the promises, which delays opening streams relative to reading and then closing them, or replacing fs with fs-extra, or both, or magic dust, but it works. (For what it's worth, fs-extra delegates to graceful-fs under the hood, which queues up open() calls and retries them when it hits EMFILE, so that swap is likely a big part of it.)

In the end it was able to process all 29,088 files, totalling 400 GB, so I call it a success. Feel free to use it, or leave suggestions.
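
For anyone who would rather avoid the problem by construction than rely on fs-extra's queuing, below is a minimal sketch of a fully sequential variant. It is not the code above: the walk/hashFile helper names are mine, it uses only Node's built-in fs, fs/promises, crypto, and path modules, and for simplicity it recomputes the hash even for files already named after one. Because each file is awaited before the next one is opened, at most one read stream exists at any time, so EMFILE should never trigger.

const crypto = require('crypto');
const fs = require('fs');
const fsp = require('fs/promises');
const path = require('path');

const seenHashes = new Set(); // Set lookups stay fast even with ~30,000 hashes

function hashFile(filePath) {
    // Wrap the stream in a promise so callers can await the finished digest
    return new Promise((resolve, reject) => {
        const hash = crypto.createHash('md5');
        fs.createReadStream(filePath)
            .on('error', reject)
            .on('data', (chunk) => hash.update(chunk))
            .on('end', () => resolve(hash.digest('hex')));
    });
}

async function walk(dir) {
    const entries = await fsp.readdir(dir, { withFileTypes: true });
    for (const entry of entries) { // sequential: one file at a time
        const filePath = path.join(dir, entry.name);
        if (entry.isDirectory()) {
            await walk(filePath); // recurse into subdirectories
            continue;
        }
        const md5 = await hashFile(filePath);
        if (seenHashes.has(md5)) {
            console.log(`Found dup: ${md5} loc: ${filePath}`);
            await fsp.unlink(filePath); // delete the duplicate
        } else {
            seenHashes.add(md5);
            const target = path.join(dir, `${md5}${path.extname(filePath)}`);
            if (target !== filePath) await fsp.rename(filePath, target);
        }
    }
}

walk(path.join(__dirname, 'media')).then(() => console.log('Done'));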

