User Tools

Site Tools


Differences

This shows you the differences between two versions of the page.

Link to this comparison view

find-duplicates [2019-02-01 15:44] (current)
ziggi created
Line 1: Line 1:
 +=====Find media duplicates=====
 +
 +Uses MongoDB with a unique index on a hash of the first 2 MB of each file (combined with the file size) to find duplicate media files.
 + 
 +<code javascript fd.js>
 +'use strict'​
 +
 +const fs = require('​fs'​)
 +const path = require('​path'​)
 +const util = require('​util'​)
 +const readChunk = require('​read-chunk'​)
 +const crypto = require('​crypto'​)
 +
 +const readdir = util.promisify(fs.readdir)
 +const stat = util.promisify(fs.stat)
 +
 +const mongo = require('​mongodb'​)
 +const url = '​mongodb://​localhost:​27017'​
 +const client = new mongo.MongoClient(url,​ {
 +    useNewUrlParser:​ true
 +})
 +
 +async function walk(dir, collection) {
 +    var list = await readdir(dir)
 +    for (item of list) {
 +        var file = dir + '/'​ + item
 +        var stats = await stat(file)
 +
 +        console.log('​file:​ ', file)
 +        try {
 +            if (stats.isFile()) {
 +                const buffer = readChunk.sync(file,​ 0, 2 * 1024 * 1024)
 +                const hex1 = crypto.createHmac('​md5',​ buffer).digest('​hex'​)
 +                const hex2 = crypto.createHmac('​md5',​ `${stats.size}`).digest('​hex'​)
 +                const hex = hex1 + hex2
 +                try {
 +                    var res = await collection.insertOne({
 +                        name: file,
 +                        hex: hex
 +                    })
 +                } catch (err) {
 +                    const dubl = await collection.find({
 +                        hex: hex
 +                    }).toArray()
 +                    console.log('​\ndublicate:​ ', file)
 +                    console.log(' ​        : ', dubl[0].name)
 +                }
 +            }
 +            if (stats.isDirectory()) {
 +                await walk(file, collection)
 +            }
 +        } catch (err) { console.log(err)}
 +    }
 +}
 +
 +async function main() {
 +    const connect = await client.connect()
 +    const db = await client.db('​files'​)
 +
 +    const collection = db.collection('​files'​)
 +    const res = await collection.deleteMany({})
 +    await collection.createIndex({ hex: 1 }, { unique: true })
 +    await walk('/​media/​photo',​ collection)
 +    await client.close();​
 +}
 +
 +main()
 +</​code>​
 +
 +
 +----
 +[<>]