User Tools

Site Tools


Find media duplicates

Uses MongoDB with a unique index on a fingerprint of each file: a hash of the first 2 MB of the file combined with a hash of the file size.

fd.js
'use strict'
 
const fs = require('fs')
const path = require('path')
const util = require('util')
const readChunk = require('read-chunk')
const crypto = require('crypto')
 
// Promise-returning wrappers around the callback-style fs APIs so that
// walk() can `await` them.
const readdir = util.promisify(fs.readdir)
const stat = util.promisify(fs.stat)
 
// Single MongoDB client shared by the whole script; main() connects and
// closes it. The URL points at a server on localhost, port 27017.
const mongo = require('mongodb')
const url = 'mongodb://localhost:27017'
const client = new mongo.MongoClient(url, {
    useNewUrlParser: true
})
 
/**
 * Recursively scans `dir` and inserts one document per regular file into
 * `collection`, reporting files whose fingerprint is already stored.
 *
 * The fingerprint (`hex`) is derived from the first 2 MiB of the file and
 * from the file size. Duplicate detection relies on the caller having put
 * a unique index on the `hex` field: a second insert with the same
 * fingerprint is rejected and the earlier file is then looked up and logged.
 *
 * Errors for a single entry (unreadable file, dangling symlink, unexpected
 * database failure) are logged and the scan continues with the next entry.
 *
 * @param {string} dir - Directory to scan.
 * @param {object} collection - MongoDB collection with a unique index on `hex`.
 * @returns {Promise<void>} Resolves when the whole subtree has been visited.
 * @throws If `dir` itself cannot be listed.
 */
async function walk(dir, collection) {
    // MongoDB server error code for a unique-index violation.
    const DUPLICATE_KEY = 11000

    const list = await readdir(dir)
    // The loop variable must be declared: this file is in strict mode, where
    // assigning to an undeclared name throws a ReferenceError.
    for (const item of list) {
        const file = path.join(dir, item)
        console.log('file: ', file)
        try {
            // stat() is inside the try so one bad entry does not abort the scan.
            const stats = await stat(file)
            if (stats.isFile()) {
                const buffer = readChunk.sync(file, 0, 2 * 1024 * 1024)
                // Both digests are HMAC-MD5 with the value used as the key and
                // no message data; kept as-is so stored fingerprints stay stable.
                const hex1 = crypto.createHmac('md5', buffer).digest('hex')
                const hex2 = crypto.createHmac('md5', `${stats.size}`).digest('hex')
                const hex = hex1 + hex2
                try {
                    await collection.insertOne({
                        name: file,
                        hex: hex
                    })
                } catch (err) {
                    // Only a unique-index violation means "duplicate"; anything
                    // else is a real failure and is reported by the outer catch.
                    if (err.code !== DUPLICATE_KEY) {
                        throw err
                    }
                    const first = await collection.findOne({
                        hex: hex
                    })
                    console.log('\ndublicate: ', file)
                    if (first) {
                        console.log('         : ', first.name)
                    }
                }
            } else if (stats.isDirectory()) {
                await walk(file, collection)
            }
        } catch (err) { console.log(err) }
    }
}
 
/**
 * Entry point: connects to MongoDB, empties the `files` collection of the
 * `files` database, ensures a unique index on `hex`, scans `/media/photo`
 * for duplicates, and closes the connection.
 *
 * @returns {Promise<void>} Resolves once the scan is done and the client is closed.
 * @throws Propagates connection, index-creation and top-level scan errors.
 */
async function main() {
    try {
        await client.connect()
        const collection = client.db('files').collection('files')
        // Start from an empty collection so only this run's files are compared.
        await collection.deleteMany({})
        await collection.createIndex({ hex: 1 }, { unique: true })
        await walk('/media/photo', collection)
    } finally {
        // Always release the connection, even when a step above failed;
        // otherwise the open socket keeps the process alive.
        await client.close()
    }
}
 
// Report a fatal error and signal failure instead of leaving the promise
// rejection unhandled.
main().catch((err) => {
    console.error(err)
    process.exitCode = 1
})

First PagePrevious PageBack to overviewNext PageLast Page