This shows you the differences between two versions of the page.
find-duplicates [2019-06-21 14:14] |
find-duplicates [2020-02-15 00:57] (current) |
||
---|---|---|---|
Line 1: | Line 1: | ||
+ | =====Find media duplicates===== | ||
+ | |||
+ | Use MongoDB with a unique index on a hash of the first 2 MB of each file (combined with a hash of the file size) to detect duplicates. | ||
+ | |||
+ | <code javascript fd.js> | ||
+ | 'use strict' | ||
+ | |||
+ | const fs = require('fs') | ||
+ | const path = require('path') | ||
+ | const util = require('util') | ||
+ | const readChunk = require('read-chunk') | ||
+ | const crypto = require('crypto') | ||
+ | |||
+ | const readdir = util.promisify(fs.readdir) | ||
+ | const stat = util.promisify(fs.stat) | ||
+ | |||
+ | const mongo = require('mongodb') | ||
+ | const url = 'mongodb://localhost:27017' | ||
+ | const client = new mongo.MongoClient(url, { | ||
+ | useNewUrlParser: true | ||
+ | }) | ||
+ | |||
/** Number of leading bytes of each file that feed the content hash (2 MiB). */
const HEAD_BYTES = 2 * 1024 * 1024

/** MongoDB server error code reported for a unique-index violation. */
const DUPLICATE_KEY_CODE = 11000

/**
 * Build the duplicate-detection key for a file: the MD5 of its first
 * HEAD_BYTES bytes followed by the MD5 of its size written in decimal.
 *
 * @param {string} file - Path of the file to read.
 * @param {number} size - File size in bytes, as reported by stat().
 * @returns {Promise<string>} 64 lowercase hex characters.
 */
async function fingerprint(file, size) {
  const want = Math.min(size, HEAD_BYTES)
  const head = Buffer.alloc(want)
  let got = 0
  const handle = await fs.promises.open(file, 'r')
  try {
    // A single read() may return fewer bytes than requested, so keep reading
    // until the head is complete or the file ends early.
    while (got < want) {
      const { bytesRead } = await handle.read(head, got, want - got, got)
      if (bytesRead === 0) break
      got += bytesRead
    }
  } finally {
    await handle.close()
  }
  const headHex = crypto.createHash('md5').update(head.subarray(0, got)).digest('hex')
  const sizeHex = crypto.createHash('md5').update(String(size)).digest('hex')
  return headHex + sizeHex
}

/**
 * Insert one file into the collection, or report it as a duplicate when the
 * unique index on `hex` rejects the insert.
 *
 * @param {string} file - Path of the file.
 * @param {number} size - File size in bytes.
 * @param {object} collection - Collection exposing insertOne() and find().
 * @returns {Promise<void>}
 * @throws Any insert error that is not a duplicate-key violation.
 */
async function recordFile(file, size, collection) {
  const hex = await fingerprint(file, size)
  try {
    await collection.insertOne({ name: file, hex })
  } catch (err) {
    // Only a unique-index violation means "duplicate"; anything else
    // (connection loss, validation, ...) must not be reported as one.
    if (err.code !== DUPLICATE_KEY_CODE) throw err
    const dubl = await collection.find({ hex }).toArray()
    console.log('\ndublicate: ', file)
    console.log(' : ', dubl[0].name)
  }
}

/**
 * Recursively visit every entry below `dir`, recording each regular file in
 * `collection` and printing the files whose fingerprint is already stored.
 *
 * A failure on a single entry (unreadable file, broken symlink, database
 * error) is logged and the traversal continues with the next entry.
 *
 * @param {string} dir - Directory to scan.
 * @param {object} collection - Collection with a unique index on `hex`.
 * @returns {Promise<void>}
 * @throws If `dir` itself cannot be listed.
 */
async function walk(dir, collection) {
  const list = await fs.promises.readdir(dir)
  for (const item of list) {
    const file = path.join(dir, item)
    console.log('file: ', file)
    try {
      // stat() is inside the try so one bad entry cannot abort the whole scan.
      const stats = await fs.promises.stat(file)
      if (stats.isFile()) {
        await recordFile(file, stats.size, collection)
      } else if (stats.isDirectory()) {
        await walk(file, collection)
      }
    } catch (err) {
      console.log(err)
    }
  }
}
+ | |||
/**
 * Entry point: connect to MongoDB, reset the `files.files` collection, make
 * sure the unique index on `hex` exists, then scan /media/photo for
 * duplicates.
 *
 * The collection is emptied on every run, so fingerprints from earlier runs
 * are never compared against.
 *
 * @returns {Promise<void>}
 * @throws If connecting, preparing the collection, or listing the root
 *   directory fails; the client is closed before the error propagates.
 */
async function main() {
  try {
    await client.connect()
    const collection = client.db('files').collection('files')
    await collection.deleteMany({})
    await collection.createIndex({ hex: 1 }, { unique: true })
    await walk('/media/photo', collection)
  } finally {
    // Always release the connection; an open client keeps the process alive.
    await client.close()
  }
}
+ | |||
// Report a failed run and exit non-zero instead of leaving the promise floating.
main().catch((err) => {
  console.error(err)
  process.exitCode = 1
})
+ | </code> | ||
+ | |||
+ | |||
+ | ---- | ||
+ | [<>] | ||