Remove duplicate tool redo

This commit is contained in:
mrfry 2021-03-11 17:22:42 +01:00
parent 87df6638ed
commit 14c9df4a62

View file

@ -1,309 +1,209 @@
/* ---------------------------------------------------------------------------- const minpercent = 97
const resultDbFileName = 'res.json'
Question Server question file merger // ---------------------------------------------------------------------------------------------------
GitLab: <https://gitlab.com/MrFry/mrfrys-node-server>
This program is free software: you can redistribute it and/or modify const utils = require('../../dist/utils/utils.js').default // eslint-disable-line
it under the terms of the GNU General Public License as published by const logger = require('../../dist/utils/logger.js').default // eslint-disable-line
the Free Software Foundation, either version 3 of the License, or const { addQuestion, doSearch } = require('../../dist/utils/classes.js') // eslint-disable-line
(at your option) any later version.
This program is distributed in the hope that it will be useful, // TODO: merge 2 dbs
but WITHOUT ANY WARRANTY; without even the implied warranty of // TODO: filter questions out from a db based on another, producing a new one
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License const params = process.argv.splice(2)
along with this program. If not, see <https://www.gnu.org/licenses/>.
------------------------------------------------------------------------- */ console.log('Params', params)
const utils = require('./utils.js') const fileA = params[0]
const { const fileB = params[1]
searchData,
addQuestion,
getSubjNameWithoutYear,
createQuestion,
} = require('./classes.js')
const actions = require('./actions.js')
const logger = require('./logger.js')
const resultFileName = 'res.json' const dbA = utils.ReadJSON(fileA)
const minMatchAmmount = 100 const dbB = fileB ? utils.ReadJSON(fileB) : null
const logPath = './mergeLogs/mergelog_' + GetDateString().replace(/ /g, '_') const line =
'===================================================================='
const logPath = './duplicateRemovingLog/'
utils.CreatePath(logPath)
Main() if (!dbB) {
rmDuplicates(dbA).then((res) => {
async function Main() { utils.WriteFile(JSON.stringify(res), resultDbFileName)
const params = GetParams() console.log('File written')
console.log(params)
if (params.length === 0) {
console.error('No params! Need a path to a question database!')
process.exit()
}
const data = actions.LoadJSON(params[0])
PrintDB(data)
console.log(hr('='))
const { res, stats } = await RemoveDuplicates(data)
console.log(hr('='))
LogStats(stats, data, res)
console.log(hr('='))
console.log('Result database:')
PrintDB(res)
console.log(hr('='))
utils.WriteFile(JSON.stringify(res), resultFileName)
console.log(C('green') + resultFileName + ' written!' + C())
console.log(hr('='))
console.log(C('green') + 'Done' + C())
}
function LogStats(stats, oldData, newData) {
const maxSubjNameLength = MaxLengthOf(stats, 'name')
const maxPrevLength = MaxLengthOf(stats, 'prevQuestions')
const maxAddedLength = MaxLengthOf(stats, 'addedQuestions')
const maxRemovedLength = MaxLengthOf(stats, 'removedQuestions')
stats.forEach((currStat) => {
const { name, prevQuestions, addedQuestions, removedQuestions } = currStat
let toLog = ''
toLog += C('green')
toLog += GetExactLength(name, maxSubjNameLength)
toLog += C()
toLog += ' '
toLog += C('magenta')
toLog += GetExactLength(prevQuestions, maxPrevLength)
toLog += C()
toLog += C('cyan')
toLog += ' -> '
toLog += C()
toLog += C('green')
toLog += GetExactLength(addedQuestions, maxAddedLength)
toLog += C()
toLog += ' [ '
toLog += C('red')
toLog += GetExactLength(removedQuestions, maxRemovedLength)
toLog += C()
toLog += ' ]'
console.log(toLog)
}) })
console.log(hr()) } else {
console.log('Old data:') difference({ dbA: dbA, dbB: dbB }).then((res) => {
LogDataCount(oldData) utils.WriteFile(JSON.stringify(res), resultDbFileName)
console.log('New data:') console.log('File written')
LogDataCount(newData)
}
/**
 * Prints a one-line summary of a question database: the number of subjects
 * and the total number of questions across all of them.
 *
 * @param {Array<{Questions: Array}>} data - subjects, each carrying a
 *   `Questions` array whose length is counted
 */
function LogDataCount(data) {
  const subjectCount = data.length

  let questionCount = 0
  for (const subject of data) {
    questionCount += subject.Questions.length
  }

  // Each number is wrapped in a green color code followed by a reset
  console.log(
    `Subjects: ${C('green')}${subjectCount}${C()}, Questions: ${C(
      'green'
    )}${questionCount}${C()}`
  )
}
function PrintDB(data) {
const maxSubjNameLength = MaxLengthOf(data, 'Name')
data.forEach((subj) => {
let toLog = ''
toLog += C('green')
toLog += GetExactLength(subj.Name, maxSubjNameLength)
toLog += C()
toLog += ' [ '
toLog += C('cyan')
toLog += subj.Questions.length
toLog += C()
toLog += ' ]'
console.log(toLog)
}) })
console.log(hr())
LogDataCount(data)
console.log(hr())
} }
function GetExactLength(string, length) { async function rmDuplicates(db) {
let toLog = string.toString() return await difference({ dbA: db })
const lengthDiff = length - toLog.length }
for (let i = 0; i < lengthDiff; i++) {
toLog += ' ' async function difference({ dbA, dbB }) {
const doingDifference = !!dbB
// Stuff only from A
const resultDb = []
let dbLength = 0
let removedTotal = 0
iterateSubjects(dbA, () => {
dbLength++
})
const getResultDbLength = () => {
let resultDbLength = 0
iterateSubjects(resultDb, () => {
resultDbLength++
})
return resultDbLength
} }
return toLog for (let i = 0; i < dbA.length; i++) {
} const subj = dbA[i]
const subjLogPath = logPath + subj.Name
utils.WriteFile('', subjLogPath)
let removedCount = 0
function MaxLengthOf(prop, key) { hr()
return prop.reduce((acc, currStat) => { console.log(
if (acc < currStat[key].toString().length) { `${C('blue')}${i + 1} / ${dbA.length}: ${C('green')}${subj.Name}, ${C(
acc = currStat[key].toString().length 'blue'
} )}${subj.Questions.length}${C('green')} questions${C()}`
return acc )
}, 0) console.log(
} `${C('green')}Result length: ${C('blue')}${getResultDbLength()}${C(
'green'
)}, original length: ${C('blue')}${dbLength}${C()}`
)
async function RemoveDuplicates(data) {
console.log(C('yellow') + 'Removing duplicates' + C())
let res = []
const stats = []
for (let i = 0; i < data.length; i++) {
const subj = data[i]
const logFile =
logPath + '/' + subj.Name.replace(/ /g, '_').replace(/\//g, '-')
let addedQuestions = 0
let removedQuestions = 0
for (let j = 0; j < subj.Questions.length; j++) { for (let j = 0; j < subj.Questions.length; j++) {
const question = subj.Questions[j] const question = subj.Questions[j]
try { const searchRes = await search({
// Searching for same question in result database qdb: doingDifference ? dbB : resultDb,
const tempQuestion = createQuestion( subjName: subj.Name,
question.Q, question: question,
question.A, searchInAllIfNoResult: doingDifference,
question.data })
)
let result = await searchData(res, tempQuestion)
result = result.reduce((acc, res) => {
if (res.match >= minMatchAmmount) {
acc.push(res)
}
return acc
}, [])
// if htere are more that one same questions in the new database printProgressBar(j + 1, subj.Questions.length)
if (result.length > 0) {
utils.AppendToFile(hr('#'), logFile) const res = hasRequiredPercent(searchRes, minpercent)
utils.AppendToFile('QUESTION', logFile)
utils.AppendToFile(JSON.stringify(tempQuestion, null, 2), logFile) // no result: adding to difference
utils.AppendToFile(hr(), logFile) if (res.length === 0) {
utils.AppendToFile('SAMES', logFile) // no result: adding to difference
utils.AppendToFile(JSON.stringify(result, null, 2), logFile) addQuestion(resultDb, subj.Name, question)
removedQuestions++ } else {
} else { // has result, not adding to difference
// if no same questions are fount then adding it to then new db utils.AppendToFile(
addQuestion(res, getSubjNameWithoutYear(subj.Name), tempQuestion) line +
addedQuestions++ '\n' +
} line +
LogResultProgress( '\n' +
subj, JSON.stringify(question, null, 2) +
i, '\n' +
j, line +
subj.Questions.length, JSON.stringify(res, null, 2) +
addedQuestions, '\n',
removedQuestions, subjLogPath
data.length
) )
} catch (err) { removedCount++
console.log() removedTotal++
console.log('ERROR')
console.log(err)
console.log('QUESTION')
console.log(question)
console.log()
} }
} }
stats.push({ console.log(
name: subj.Name, `${C('yellow')}Removed ${C('red')}${removedCount}${C(
prevQuestions: subj.Questions.length, 'yellow'
addedQuestions: addedQuestions, )} questions${C()}`
removedQuestions: removedQuestions, )
})
} }
return { res, stats }
}
function LogResultProgress( hr()
subj, // console.log('RESTUL', JSON.stringify(resultDb, null, 2))
i, console.log(
j, `Result length: ${getResultDbLength()}, original length: ${dbLength}, removed ${removedTotal} questions`
length,
addedQuestions,
removedQuestions,
subjCount
) {
process.stdout.write(
'[ ' +
C('cyan') +
(i + 1) +
C() +
' / ' +
C('green') +
subjCount +
C() +
' ] ' +
C('yellow') +
subj.Name +
C() +
': ' +
C('green') +
subj.Questions.length +
' ' +
C('cyan') +
'-> ' +
C('green') +
addedQuestions +
C() +
', removed: ' +
C('red') +
removedQuestions +
C()
) )
return resultDb
}
if (j === length - 1) { function hasRequiredPercent(result, minpercent) {
process.stdout.write('\n') return result.reduce((acc, res) => {
} else { if (res.match >= minpercent) {
acc.push(res)
}
return acc
}, [])
}
// ---------------------------------------------------------------------------------
// db editing tools
// ---------------------------------------------------------------------------------
/**
 * Promise-returning wrapper around `doSearch`.
 *
 * Forwards the given arguments to `doSearch`, always passing `null` as the
 * fourth argument and the module-level `minpercent` as the fifth. Declared
 * `async` so the result is wrapped in a promise and a synchronous throw from
 * `doSearch` surfaces as a rejection, without hand-building a Promise.
 *
 * @param {Object} params
 * @param {Array} params.qdb - question database to search in
 * @param {string} params.subjName - subject name handed to `doSearch`
 * @param {Object} params.question - question to look for
 * @param {boolean} params.searchInAllIfNoResult - passed through unchanged
 *   as the last `doSearch` argument
 * @returns {Promise<*>} resolves with whatever `doSearch` returns
 */
async function search({ qdb, subjName, question, searchInAllIfNoResult }) {
  return doSearch(
    qdb,
    subjName,
    question,
    null,
    minpercent,
    searchInAllIfNoResult
  )
}
/**
 * Visits every question of every subject in a database, in order.
 *
 * @param {Array<{Questions: Array}>} db - subjects, each with a `Questions` array
 * @param {function(Object, *): void} fn - invoked as `fn(subject, question)`
 *   once per question
 */
function iterateSubjects(db, fn) {
  for (const subject of db) {
    for (const question of subject.Questions) {
      fn(subject, question)
    }
  }
}
// ---------------------------------------------------------------------------------
// logging and tools
// ---------------------------------------------------------------------------------
/**
 * Prints a cyan horizontal rule of '=' characters spanning the terminal width.
 *
 * When `process.stdout.columns` is not set, the rule is empty and only the
 * color codes are printed.
 */
function hr() {
  const width = process.stdout.columns || 0
  const rule = '='.repeat(width)
  console.log(`${C('cyan')}${rule}${C()}`)
}
function writeInSameLine(text, returnToLineStart) {
process.stdout.write(text)
if (returnToLineStart) {
process.stdout.write('\r') process.stdout.write('\r')
} else {
process.stdout.write('\n')
} }
} }
function hr(char) { function printProgressBar(current, total) {
let hr = '' const width = process.stdout.columns - 25
const cols = process.stdout.columns || 20
for (let i = 0; i < cols; i++) { if (width <= 0) {
hr += char || '-' return
} }
return hr
const x = width / total
const xCurrent = Math.floor(current * x)
const xTotal = Math.floor(total * x)
let line = ''
for (let i = 0; i < xCurrent; i++) {
line += '='
}
for (let i = 0; i < xTotal - xCurrent; i++) {
line += ' '
}
const numbers = `${current} / ${total}`
writeInSameLine(
`${C('magenta')} [${line}]${C('green')} ${numbers}${C()}`,
current !== total
)
} }
function C(color) { function C(color) {
return logger.C(color) return logger.C(color)
} }
/**
 * Returns the command line arguments passed to the script, i.e. everything
 * in `process.argv` after the first two entries.
 *
 * Uses `slice` so `process.argv` is left untouched: `splice` would remove the
 * arguments from the global array, making any later read come back empty.
 *
 * @returns {string[]} a new array holding the script's arguments
 */
function GetParams() {
  return process.argv.slice(2)
}
/**
 * Formats the current local date and time as `YYYY-MM-DD HH:MM:SS`,
 * zero-padding every field except the year to two digits.
 *
 * @returns {string} the formatted timestamp
 */
function GetDateString() {
  const now = new Date()
  const twoDigits = (value) => String(value).padStart(2, '0')

  const datePart = [
    now.getFullYear(),
    twoDigits(now.getMonth() + 1), // getMonth() is zero-based
    twoDigits(now.getDate()),
  ].join('-')

  const timePart = [
    twoDigits(now.getHours()),
    twoDigits(now.getMinutes()),
    twoDigits(now.getSeconds()),
  ].join(':')

  return `${datePart} ${timePart}`
}