Remove duplicate tool redo

This commit is contained in:
mrfry 2021-03-11 17:22:42 +01:00
parent 87df6638ed
commit 14c9df4a62

View file

@ -1,309 +1,209 @@
/* ----------------------------------------------------------------------------
const minpercent = 97
const resultDbFileName = 'res.json'
Question Server question file merger
GitLab: <https://gitlab.com/MrFry/mrfrys-node-server>
// ---------------------------------------------------------------------------------------------------
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
const utils = require('../../dist/utils/utils.js').default // eslint-disable-line
const logger = require('../../dist/utils/logger.js').default // eslint-disable-line
const { addQuestion, doSearch } = require('../../dist/utils/classes.js') // eslint-disable-line
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
// TODO: merge 2 dbs
// TODO: filter questions out from a db based on another, producing a new one
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
const params = process.argv.splice(2)
------------------------------------------------------------------------- */
console.log('Params', params)
const utils = require('./utils.js')
const {
searchData,
addQuestion,
getSubjNameWithoutYear,
createQuestion,
} = require('./classes.js')
const actions = require('./actions.js')
const logger = require('./logger.js')
const fileA = params[0]
const fileB = params[1]
const resultFileName = 'res.json'
const minMatchAmmount = 100
const dbA = utils.ReadJSON(fileA)
const dbB = fileB ? utils.ReadJSON(fileB) : null
const logPath = './mergeLogs/mergelog_' + GetDateString().replace(/ /g, '_')
const line =
'===================================================================='
const logPath = './duplicateRemovingLog/'
utils.CreatePath(logPath)
Main()
/**
 * Entry point of the duplicate removing tool.
 *
 * Reads the command line parameters, loads the question database named by the
 * first one, prints it, runs RemoveDuplicates on it, prints the statistics and
 * the resulting database, then writes the result as JSON to `resultFileName`.
 *
 * Exits the process with status 1 when no parameter was given.
 *
 * @returns {Promise<void>}
 */
async function Main() {
  const params = GetParams()
  console.log(params)
  if (params.length === 0) {
    console.error('No params! Need a path to a question database!')
    // A missing argument is a usage error: exit with a non-zero status so
    // shells and scripts can detect it (a bare process.exit() reports 0).
    process.exit(1)
  }

  const data = actions.LoadJSON(params[0])

  PrintDB(data)
  console.log(hr('='))

  const { res, stats } = await RemoveDuplicates(data)
  console.log(hr('='))

  LogStats(stats, data, res)
  console.log(hr('='))

  console.log('Result database:')
  PrintDB(res)
  console.log(hr('='))

  utils.WriteFile(JSON.stringify(res), resultFileName)
  console.log(C('green') + resultFileName + ' written!' + C())
  console.log(hr('='))
  console.log(C('green') + 'Done' + C())
}
function LogStats(stats, oldData, newData) {
const maxSubjNameLength = MaxLengthOf(stats, 'name')
const maxPrevLength = MaxLengthOf(stats, 'prevQuestions')
const maxAddedLength = MaxLengthOf(stats, 'addedQuestions')
const maxRemovedLength = MaxLengthOf(stats, 'removedQuestions')
stats.forEach((currStat) => {
const { name, prevQuestions, addedQuestions, removedQuestions } = currStat
let toLog = ''
toLog += C('green')
toLog += GetExactLength(name, maxSubjNameLength)
toLog += C()
toLog += ' '
toLog += C('magenta')
toLog += GetExactLength(prevQuestions, maxPrevLength)
toLog += C()
toLog += C('cyan')
toLog += ' -> '
toLog += C()
toLog += C('green')
toLog += GetExactLength(addedQuestions, maxAddedLength)
toLog += C()
toLog += ' [ '
toLog += C('red')
toLog += GetExactLength(removedQuestions, maxRemovedLength)
toLog += C()
toLog += ' ]'
console.log(toLog)
if (!dbB) {
rmDuplicates(dbA).then((res) => {
utils.WriteFile(JSON.stringify(res), resultDbFileName)
console.log('File written')
})
} else {
difference({ dbA: dbA, dbB: dbB }).then((res) => {
utils.WriteFile(JSON.stringify(res), resultDbFileName)
console.log('File written')
})
console.log(hr())
console.log('Old data:')
LogDataCount(oldData)
console.log('New data:')
LogDataCount(newData)
}
function LogDataCount(data) {
const subjLength = data.length
const qLength = data.reduce((acc, subj) => {
return acc + subj.Questions.length
}, 0)
/**
 * Runs `difference` with only `dbA` set to the given database and resolves
 * with whatever that call resolves with.
 *
 * @param {Array} db question database to process
 * @returns {Promise<*>} the result of `difference({ dbA: db })`
 */
async function rmDuplicates(db) {
  const request = { dbA: db }
  const deduplicated = await difference(request)
  return deduplicated
}
async function difference({ dbA, dbB }) {
const doingDifference = !!dbB
// Stuff only from A
const resultDb = []
let dbLength = 0
let removedTotal = 0
iterateSubjects(dbA, () => {
dbLength++
})
const getResultDbLength = () => {
let resultDbLength = 0
iterateSubjects(resultDb, () => {
resultDbLength++
})
return resultDbLength
}
for (let i = 0; i < dbA.length; i++) {
const subj = dbA[i]
const subjLogPath = logPath + subj.Name
utils.WriteFile('', subjLogPath)
let removedCount = 0
hr()
console.log(
'Subjects: ' +
C('green') +
subjLength +
C() +
', Questions: ' +
C('green') +
qLength +
C()
`${C('blue')}${i + 1} / ${dbA.length}: ${C('green')}${subj.Name}, ${C(
'blue'
)}${subj.Questions.length}${C('green')} questions${C()}`
)
console.log(
`${C('green')}Result length: ${C('blue')}${getResultDbLength()}${C(
'green'
)}, original length: ${C('blue')}${dbLength}${C()}`
)
}
/**
 * Prints one line per subject (name padded to the longest subject name,
 * followed by its question count in brackets), then a horizontal rule, the
 * overall counts via LogDataCount, and a closing horizontal rule.
 *
 * @param {Array<{Name: string, Questions: Array}>} data question database
 */
function PrintDB(data) {
  const nameColumnWidth = MaxLengthOf(data, 'Name')

  data.forEach((subj) => {
    const row = `${C('green')}${GetExactLength(
      subj.Name,
      nameColumnWidth
    )}${C()} [ ${C('cyan')}${subj.Questions.length}${C()} ]`
    console.log(row)
  })

  console.log(hr())
  LogDataCount(data)
  console.log(hr())
}
/**
 * Converts a value to a string and right-pads it with spaces up to `length`
 * characters. Values already at least that long are returned unpadded and
 * are never truncated.
 *
 * @param {*} string value to format; must have a `toString` method
 * @param {number} length desired minimum width
 * @returns {string} the padded text
 */
function GetExactLength(string, length) {
  const text = string.toString()
  const missing = length - text.length
  if (!(missing > 0)) {
    return text
  }
  // Math.ceil mirrors a counting loop "for i < missing" on fractional widths
  return text + ' '.repeat(Math.ceil(missing))
}
/**
 * Finds the length of the longest string form of `item[key]` over all items.
 *
 * @param {Array<Object>} prop items to inspect
 * @param {string} key property whose string length is measured
 * @returns {number} the largest length found, or 0 for an empty array
 */
function MaxLengthOf(prop, key) {
  return prop.reduce((longest, item) => {
    const currentLength = item[key].toString().length
    return currentLength > longest ? currentLength : longest
  }, 0)
}
async function RemoveDuplicates(data) {
console.log(C('yellow') + 'Removing duplicates' + C())
let res = []
const stats = []
for (let i = 0; i < data.length; i++) {
const subj = data[i]
const logFile =
logPath + '/' + subj.Name.replace(/ /g, '_').replace(/\//g, '-')
let addedQuestions = 0
let removedQuestions = 0
for (let j = 0; j < subj.Questions.length; j++) {
const question = subj.Questions[j]
try {
// Searching for same question in result database
const tempQuestion = createQuestion(
question.Q,
question.A,
question.data
const searchRes = await search({
qdb: doingDifference ? dbB : resultDb,
subjName: subj.Name,
question: question,
searchInAllIfNoResult: doingDifference,
})
printProgressBar(j + 1, subj.Questions.length)
const res = hasRequiredPercent(searchRes, minpercent)
// no result: adding to difference
if (res.length === 0) {
// no result: adding to difference
addQuestion(resultDb, subj.Name, question)
} else {
// has result, not adding to difference
utils.AppendToFile(
line +
'\n' +
line +
'\n' +
JSON.stringify(question, null, 2) +
'\n' +
line +
JSON.stringify(res, null, 2) +
'\n',
subjLogPath
)
let result = await searchData(res, tempQuestion)
result = result.reduce((acc, res) => {
if (res.match >= minMatchAmmount) {
removedCount++
removedTotal++
}
}
console.log(
`${C('yellow')}Removed ${C('red')}${removedCount}${C(
'yellow'
)} questions${C()}`
)
}
hr()
// console.log('RESTUL', JSON.stringify(resultDb, null, 2))
console.log(
`Result length: ${getResultDbLength()}, original length: ${dbLength}, removed ${removedTotal} questions`
)
return resultDb
}
/**
 * Keeps only the search results whose `match` value reaches the threshold.
 *
 * @param {Array<{match: number}>} result search results to filter
 * @param {number} minpercent minimum `match` value (inclusive)
 * @returns {Array<{match: number}>} new array with the qualifying results,
 *   in their original order
 */
function hasRequiredPercent(result, minpercent) {
  return result.filter((candidate) => candidate.match >= minpercent)
}
// if there is at least one identical question already in the new database
if (result.length > 0) {
utils.AppendToFile(hr('#'), logFile)
utils.AppendToFile('QUESTION', logFile)
utils.AppendToFile(JSON.stringify(tempQuestion, null, 2), logFile)
utils.AppendToFile(hr(), logFile)
utils.AppendToFile('SAMES', logFile)
utils.AppendToFile(JSON.stringify(result, null, 2), logFile)
removedQuestions++
} else {
// if no identical questions are found, add it to the new db
addQuestion(res, getSubjNameWithoutYear(subj.Name), tempQuestion)
addedQuestions++
}
LogResultProgress(
subj,
i,
j,
subj.Questions.length,
addedQuestions,
removedQuestions,
data.length
// ---------------------------------------------------------------------------------
// db editing tools
// ---------------------------------------------------------------------------------
function search({ qdb, subjName, question, searchInAllIfNoResult }) {
return new Promise((resolve) => {
resolve(
doSearch(qdb, subjName, question, null, minpercent, searchInAllIfNoResult)
)
} catch (err) {
console.log()
console.log('ERROR')
console.log(err)
console.log('QUESTION')
console.log(question)
console.log()
}
}
stats.push({
name: subj.Name,
prevQuestions: subj.Questions.length,
addedQuestions: addedQuestions,
removedQuestions: removedQuestions,
})
}
return { res, stats }
}
function LogResultProgress(
subj,
i,
j,
length,
addedQuestions,
removedQuestions,
subjCount
) {
process.stdout.write(
'[ ' +
C('cyan') +
(i + 1) +
C() +
' / ' +
C('green') +
subjCount +
C() +
' ] ' +
C('yellow') +
subj.Name +
C() +
': ' +
C('green') +
subj.Questions.length +
' ' +
C('cyan') +
'-> ' +
C('green') +
addedQuestions +
C() +
', removed: ' +
C('red') +
removedQuestions +
C()
)
/**
 * Calls `fn(subj, question)` once for every question of every subject, in
 * database order.
 *
 * @param {Array<{Questions: Array}>} db question database
 * @param {function(Object, *): void} fn callback invoked per question
 */
function iterateSubjects(db, fn) {
  const visitQuestions = (subj) => {
    subj.Questions.forEach((question) => {
      fn(subj, question)
    })
  }

  db.forEach((subj) => {
    visitQuestions(subj)
  })
}
if (j === length - 1) {
process.stdout.write('\n')
} else {
// ---------------------------------------------------------------------------------
// logging and tools
// ---------------------------------------------------------------------------------
/**
 * Prints a cyan horizontal rule made of '=' characters, one per column
 * reported by `process.stdout.columns`. If that value is undefined the rule
 * is empty.
 */
function hr() {
  const rule = '='.repeat(process.stdout.columns)
  console.log(`${C('cyan')}${rule}${C()}`)
}
/**
 * Writes `text` to stdout followed by a line terminator: a carriage return
 * when `returnToLineStart` is truthy (so the next write overwrites the line),
 * a newline otherwise.
 *
 * @param {string} text text to write
 * @param {boolean} returnToLineStart whether to end with '\r' instead of '\n'
 */
function writeInSameLine(text, returnToLineStart) {
  const terminator = returnToLineStart ? '\r' : '\n'
  process.stdout.write(text)
  process.stdout.write(terminator)
}
function hr(char) {
let hr = ''
const cols = process.stdout.columns || 20
for (let i = 0; i < cols; i++) {
hr += char || '-'
function printProgressBar(current, total) {
const width = process.stdout.columns - 25
if (width <= 0) {
return
}
return hr
const x = width / total
const xCurrent = Math.floor(current * x)
const xTotal = Math.floor(total * x)
let line = ''
for (let i = 0; i < xCurrent; i++) {
line += '='
}
for (let i = 0; i < xTotal - xCurrent; i++) {
line += ' '
}
const numbers = `${current} / ${total}`
writeInSameLine(
`${C('magenta')} [${line}]${C('green')} ${numbers}${C()}`,
current !== total
)
}
/**
 * Shorthand for `logger.C`: forwards `color` and returns its result.
 *
 * @param {string} [color] value passed through to `logger.C`
 * @returns {*} whatever `logger.C(color)` returns
 */
function C(color) {
  const value = logger.C(color)
  return value
}
/**
 * Returns the command line parameters, i.e. every entry of `process.argv`
 * from index 2 onwards.
 *
 * Uses `slice` rather than `splice` so `process.argv` is left untouched and
 * repeated calls keep returning the same parameters.
 *
 * @returns {string[]} a new array with the parameters (empty if none given)
 */
function GetParams() {
  return process.argv.slice(2)
}
/**
 * Formats the current local date and time as 'YYYY-MM-DD HH:MM:SS', with
 * month, day, hours, minutes and seconds zero-padded to two digits.
 *
 * @returns {string} the formatted timestamp
 */
function GetDateString() {
  const now = new Date()
  const twoDigits = (value) => ('0' + value).slice(-2)

  const datePart = [
    now.getFullYear(),
    twoDigits(now.getMonth() + 1), // getMonth() is zero-based
    twoDigits(now.getDate()),
  ].join('-')

  const timePart = [
    twoDigits(now.getHours()),
    twoDigits(now.getMinutes()),
    twoDigits(now.getSeconds()),
  ].join(':')

  return datePart + ' ' + timePart
}