mirror of
https://gitlab.com/MrFry/mrfrys-node-server
synced 2025-04-01 20:24:18 +02:00
Remove duplicate tool redo
This commit is contained in:
parent
87df6638ed
commit
14c9df4a62
1 changed files with 172 additions and 272 deletions
|
@ -1,309 +1,209 @@
|
|||
/* ----------------------------------------------------------------------------
|
||||
const minpercent = 97
|
||||
const resultDbFileName = 'res.json'
|
||||
|
||||
Question Server question file merger
|
||||
GitLab: <https://gitlab.com/MrFry/mrfrys-node-server>
|
||||
// ---------------------------------------------------------------------------------------------------
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
const utils = require('../../dist/utils/utils.js').default // eslint-disable-line
|
||||
const logger = require('../../dist/utils/logger.js').default // eslint-disable-line
|
||||
const { addQuestion, doSearch } = require('../../dist/utils/classes.js') // eslint-disable-line
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
// TODO: merge 2 dbs
|
||||
// TODO: filter questions out from a db based on another, producing a new one
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
const params = process.argv.splice(2)
|
||||
|
||||
------------------------------------------------------------------------- */
|
||||
console.log('Params', params)
|
||||
|
||||
const utils = require('./utils.js')
|
||||
const {
|
||||
searchData,
|
||||
addQuestion,
|
||||
getSubjNameWithoutYear,
|
||||
createQuestion,
|
||||
} = require('./classes.js')
|
||||
const actions = require('./actions.js')
|
||||
const logger = require('./logger.js')
|
||||
const fileA = params[0]
|
||||
const fileB = params[1]
|
||||
|
||||
const resultFileName = 'res.json'
|
||||
const minMatchAmmount = 100
|
||||
const dbA = utils.ReadJSON(fileA)
|
||||
const dbB = fileB ? utils.ReadJSON(fileB) : null
|
||||
|
||||
const logPath = './mergeLogs/mergelog_' + GetDateString().replace(/ /g, '_')
|
||||
const line =
|
||||
'===================================================================='
|
||||
const logPath = './duplicateRemovingLog/'
|
||||
utils.CreatePath(logPath)
|
||||
|
||||
Main()
|
||||
|
||||
/**
 * Entry point of the duplicate-removing tool.
 *
 * Loads the database at the path given as the first command line parameter,
 * prints it, passes it to RemoveDuplicates, logs the returned statistics and
 * writes the resulting database to `resultFileName`.
 */
async function Main() {
  const params = GetParams()
  console.log(params)
  if (params.length === 0) {
    console.error('No params! Need a path to a question database!')
    // A missing argument is an error: exit non-zero so callers can detect it
    process.exit(1)
  }
  const data = actions.LoadJSON(params[0])

  PrintDB(data)
  console.log(hr('='))

  const { res, stats } = await RemoveDuplicates(data)
  console.log(hr('='))

  LogStats(stats, data, res)
  console.log(hr('='))

  console.log('Result database:')
  PrintDB(res)
  console.log(hr('='))

  utils.WriteFile(JSON.stringify(res), resultFileName)
  console.log(C('green') + resultFileName + ' written!' + C())
  console.log(hr('='))

  console.log(C('green') + 'Done' + C())
}
|
||||
|
||||
function LogStats(stats, oldData, newData) {
|
||||
const maxSubjNameLength = MaxLengthOf(stats, 'name')
|
||||
const maxPrevLength = MaxLengthOf(stats, 'prevQuestions')
|
||||
const maxAddedLength = MaxLengthOf(stats, 'addedQuestions')
|
||||
const maxRemovedLength = MaxLengthOf(stats, 'removedQuestions')
|
||||
|
||||
stats.forEach((currStat) => {
|
||||
const { name, prevQuestions, addedQuestions, removedQuestions } = currStat
|
||||
let toLog = ''
|
||||
|
||||
toLog += C('green')
|
||||
toLog += GetExactLength(name, maxSubjNameLength)
|
||||
toLog += C()
|
||||
toLog += ' '
|
||||
toLog += C('magenta')
|
||||
toLog += GetExactLength(prevQuestions, maxPrevLength)
|
||||
toLog += C()
|
||||
toLog += C('cyan')
|
||||
toLog += ' -> '
|
||||
toLog += C()
|
||||
toLog += C('green')
|
||||
toLog += GetExactLength(addedQuestions, maxAddedLength)
|
||||
toLog += C()
|
||||
toLog += ' [ '
|
||||
toLog += C('red')
|
||||
toLog += GetExactLength(removedQuestions, maxRemovedLength)
|
||||
toLog += C()
|
||||
toLog += ' ]'
|
||||
|
||||
console.log(toLog)
|
||||
if (!dbB) {
|
||||
rmDuplicates(dbA).then((res) => {
|
||||
utils.WriteFile(JSON.stringify(res), resultDbFileName)
|
||||
console.log('File written')
|
||||
})
|
||||
} else {
|
||||
difference({ dbA: dbA, dbB: dbB }).then((res) => {
|
||||
utils.WriteFile(JSON.stringify(res), resultDbFileName)
|
||||
console.log('File written')
|
||||
})
|
||||
console.log(hr())
|
||||
console.log('Old data:')
|
||||
LogDataCount(oldData)
|
||||
console.log('New data:')
|
||||
LogDataCount(newData)
|
||||
}
|
||||
|
||||
function LogDataCount(data) {
|
||||
const subjLength = data.length
|
||||
const qLength = data.reduce((acc, subj) => {
|
||||
return acc + subj.Questions.length
|
||||
}, 0)
|
||||
/**
 * Runs `difference` with only `dbA` set, i.e. without a second database.
 *
 * @param {Array} db - database passed on as `dbA`
 * @returns {Promise} whatever `difference({ dbA: db })` resolves to
 */
async function rmDuplicates(db) {
  // An async function already adopts a returned promise; no extra await needed
  return difference({ dbA: db })
}
|
||||
|
||||
async function difference({ dbA, dbB }) {
|
||||
const doingDifference = !!dbB
|
||||
// Stuff only from A
|
||||
const resultDb = []
|
||||
let dbLength = 0
|
||||
let removedTotal = 0
|
||||
|
||||
iterateSubjects(dbA, () => {
|
||||
dbLength++
|
||||
})
|
||||
|
||||
const getResultDbLength = () => {
|
||||
let resultDbLength = 0
|
||||
iterateSubjects(resultDb, () => {
|
||||
resultDbLength++
|
||||
})
|
||||
return resultDbLength
|
||||
}
|
||||
|
||||
for (let i = 0; i < dbA.length; i++) {
|
||||
const subj = dbA[i]
|
||||
const subjLogPath = logPath + subj.Name
|
||||
utils.WriteFile('', subjLogPath)
|
||||
let removedCount = 0
|
||||
|
||||
hr()
|
||||
console.log(
|
||||
'Subjects: ' +
|
||||
C('green') +
|
||||
subjLength +
|
||||
C() +
|
||||
', Questions: ' +
|
||||
C('green') +
|
||||
qLength +
|
||||
C()
|
||||
`${C('blue')}${i + 1} / ${dbA.length}: ${C('green')}${subj.Name}, ${C(
|
||||
'blue'
|
||||
)}${subj.Questions.length}${C('green')} questions${C()}`
|
||||
)
|
||||
console.log(
|
||||
`${C('green')}Result length: ${C('blue')}${getResultDbLength()}${C(
|
||||
'green'
|
||||
)}, original length: ${C('blue')}${dbLength}${C()}`
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Logs one line per subject: the subject name padded to the longest name in
 * `data`, followed by its question count in brackets. Afterwards calls
 * LogDataCount(data) between two hr() lines.
 *
 * @param {Array<{Name: string, Questions: Array}>} data - subjects to print
 */
function PrintDB(data) {
  const maxSubjNameLength = MaxLengthOf(data, 'Name')

  data.forEach((subj) => {
    const paddedName = GetExactLength(subj.Name, maxSubjNameLength)
    const questionCount = subj.Questions.length

    console.log(
      `${C('green')}${paddedName}${C()} [ ${C('cyan')}${questionCount}${C()} ]`
    )
  })

  console.log(hr())
  LogDataCount(data)
  console.log(hr())
}
|
||||
|
||||
/**
 * Converts a value to a string and right-pads it with spaces up to `length`.
 * Values that are already `length` characters or longer are returned
 * unpadded (they are never truncated).
 *
 * @param {*} string - value to print; converted with String()
 * @param {number} length - minimum length of the returned string
 * @returns {string} the padded string
 */
function GetExactLength(string, length) {
  return String(string).padEnd(length, ' ')
}
|
||||
|
||||
/**
 * Returns the length of the longest string form of `item[key]` over all
 * items of `prop`.
 *
 * @param {Array<Object>} prop - items to inspect
 * @param {string} key - property whose printed length is measured
 * @returns {number} the longest length found, 0 for an empty array
 */
function MaxLengthOf(prop, key) {
  return prop.reduce((longest, item) => {
    const value = item[key]
    // A missing value counts as empty instead of throwing on `.toString()`
    const length = value == null ? 0 : String(value).length
    return Math.max(longest, length)
  }, 0)
}
|
||||
|
||||
async function RemoveDuplicates(data) {
|
||||
console.log(C('yellow') + 'Removing duplicates' + C())
|
||||
let res = []
|
||||
const stats = []
|
||||
|
||||
for (let i = 0; i < data.length; i++) {
|
||||
const subj = data[i]
|
||||
const logFile =
|
||||
logPath + '/' + subj.Name.replace(/ /g, '_').replace(/\//g, '-')
|
||||
let addedQuestions = 0
|
||||
let removedQuestions = 0
|
||||
for (let j = 0; j < subj.Questions.length; j++) {
|
||||
const question = subj.Questions[j]
|
||||
try {
|
||||
// Searching for same question in result database
|
||||
const tempQuestion = createQuestion(
|
||||
question.Q,
|
||||
question.A,
|
||||
question.data
|
||||
const searchRes = await search({
|
||||
qdb: doingDifference ? dbB : resultDb,
|
||||
subjName: subj.Name,
|
||||
question: question,
|
||||
searchInAllIfNoResult: doingDifference,
|
||||
})
|
||||
|
||||
printProgressBar(j + 1, subj.Questions.length)
|
||||
|
||||
const res = hasRequiredPercent(searchRes, minpercent)
|
||||
|
||||
// no result: adding to difference
|
||||
if (res.length === 0) {
|
||||
// no result: adding to difference
|
||||
addQuestion(resultDb, subj.Name, question)
|
||||
} else {
|
||||
// has result, not adding to difference
|
||||
utils.AppendToFile(
|
||||
line +
|
||||
'\n' +
|
||||
line +
|
||||
'\n' +
|
||||
JSON.stringify(question, null, 2) +
|
||||
'\n' +
|
||||
line +
|
||||
JSON.stringify(res, null, 2) +
|
||||
'\n',
|
||||
subjLogPath
|
||||
)
|
||||
let result = await searchData(res, tempQuestion)
|
||||
result = result.reduce((acc, res) => {
|
||||
if (res.match >= minMatchAmmount) {
|
||||
removedCount++
|
||||
removedTotal++
|
||||
}
|
||||
}
|
||||
console.log(
|
||||
`${C('yellow')}Removed ${C('red')}${removedCount}${C(
|
||||
'yellow'
|
||||
)} questions${C()}`
|
||||
)
|
||||
}
|
||||
|
||||
hr()
|
||||
// console.log('RESTUL', JSON.stringify(resultDb, null, 2))
|
||||
console.log(
|
||||
`Result length: ${getResultDbLength()}, original length: ${dbLength}, removed ${removedTotal} questions`
|
||||
)
|
||||
return resultDb
|
||||
}
|
||||
|
||||
/**
 * Keeps only the search results whose `match` value reaches `minpercent`.
 *
 * @param {Array<{match: number}>} result - search results to filter
 * @param {number} minpercent - inclusive lower bound for `match`
 * @returns {Array<{match: number}>} new array with the passing results, in
 *   their original order
 */
function hasRequiredPercent(result, minpercent) {
  return result.filter((res) => res.match >= minpercent)
}
|
||||
|
||||
// if there is more than one identical question in the new database
|
||||
if (result.length > 0) {
|
||||
utils.AppendToFile(hr('#'), logFile)
|
||||
utils.AppendToFile('QUESTION', logFile)
|
||||
utils.AppendToFile(JSON.stringify(tempQuestion, null, 2), logFile)
|
||||
utils.AppendToFile(hr(), logFile)
|
||||
utils.AppendToFile('SAMES', logFile)
|
||||
utils.AppendToFile(JSON.stringify(result, null, 2), logFile)
|
||||
removedQuestions++
|
||||
} else {
|
||||
// if no same questions are found then add it to the new db
|
||||
addQuestion(res, getSubjNameWithoutYear(subj.Name), tempQuestion)
|
||||
addedQuestions++
|
||||
}
|
||||
LogResultProgress(
|
||||
subj,
|
||||
i,
|
||||
j,
|
||||
subj.Questions.length,
|
||||
addedQuestions,
|
||||
removedQuestions,
|
||||
data.length
|
||||
// ---------------------------------------------------------------------------------
|
||||
// db editing tools
|
||||
// ---------------------------------------------------------------------------------
|
||||
|
||||
function search({ qdb, subjName, question, searchInAllIfNoResult }) {
|
||||
return new Promise((resolve) => {
|
||||
resolve(
|
||||
doSearch(qdb, subjName, question, null, minpercent, searchInAllIfNoResult)
|
||||
)
|
||||
} catch (err) {
|
||||
console.log()
|
||||
console.log('ERROR')
|
||||
console.log(err)
|
||||
console.log('QUESTION')
|
||||
console.log(question)
|
||||
console.log()
|
||||
}
|
||||
}
|
||||
stats.push({
|
||||
name: subj.Name,
|
||||
prevQuestions: subj.Questions.length,
|
||||
addedQuestions: addedQuestions,
|
||||
removedQuestions: removedQuestions,
|
||||
})
|
||||
}
|
||||
return { res, stats }
|
||||
|
||||
/**
 * Calls `fn(subj, question)` once for every question of every subject, in
 * database order.
 *
 * @param {Array<{Questions: Array}>} db - subjects to walk
 * @param {function(Object, *): void} fn - callback receiving the subject and
 *   one of its questions
 */
function iterateSubjects(db, fn) {
  for (const subj of db) {
    for (const question of subj.Questions) {
      fn(subj, question)
    }
  }
}
|
||||
|
||||
function LogResultProgress(
|
||||
subj,
|
||||
i,
|
||||
j,
|
||||
length,
|
||||
addedQuestions,
|
||||
removedQuestions,
|
||||
subjCount
|
||||
) {
|
||||
process.stdout.write(
|
||||
'[ ' +
|
||||
C('cyan') +
|
||||
(i + 1) +
|
||||
C() +
|
||||
' / ' +
|
||||
C('green') +
|
||||
subjCount +
|
||||
C() +
|
||||
' ] ' +
|
||||
C('yellow') +
|
||||
subj.Name +
|
||||
C() +
|
||||
': ' +
|
||||
C('green') +
|
||||
subj.Questions.length +
|
||||
' ' +
|
||||
C('cyan') +
|
||||
'-> ' +
|
||||
C('green') +
|
||||
addedQuestions +
|
||||
C() +
|
||||
', removed: ' +
|
||||
C('red') +
|
||||
removedQuestions +
|
||||
C()
|
||||
)
|
||||
// ---------------------------------------------------------------------------------
|
||||
// logging and tools
|
||||
// ---------------------------------------------------------------------------------
|
||||
|
||||
if (j === length - 1) {
|
||||
process.stdout.write('\n')
|
||||
} else {
|
||||
/**
 * Logs a horizontal rule of '=' characters as wide as the terminal, wrapped
 * in C('cyan') ... C().
 */
function hr() {
  // `columns` is undefined when stdout is not a TTY (e.g. piped output);
  // fall back to a fixed width so the rule is still visible
  const cols = process.stdout.columns || 20
  console.log(`${C('cyan')}${'='.repeat(cols)}${C()}`)
}
|
||||
|
||||
/**
 * Writes `text` to stdout, then either a carriage return (so the next write
 * overwrites the same line) or a newline.
 *
 * @param {string} text - text to write
 * @param {boolean} returnToLineStart - truthy: follow the text with '\r';
 *   falsy: follow it with '\n'
 */
function writeInSameLine(text, returnToLineStart) {
  process.stdout.write(text)
  process.stdout.write(returnToLineStart ? '\r' : '\n')
}
|
||||
|
||||
function hr(char) {
|
||||
let hr = ''
|
||||
const cols = process.stdout.columns || 20
|
||||
for (let i = 0; i < cols; i++) {
|
||||
hr += char || '-'
|
||||
function printProgressBar(current, total) {
|
||||
const width = process.stdout.columns - 25
|
||||
|
||||
if (width <= 0) {
|
||||
return
|
||||
}
|
||||
return hr
|
||||
|
||||
const x = width / total
|
||||
const xCurrent = Math.floor(current * x)
|
||||
const xTotal = Math.floor(total * x)
|
||||
|
||||
let line = ''
|
||||
for (let i = 0; i < xCurrent; i++) {
|
||||
line += '='
|
||||
}
|
||||
|
||||
for (let i = 0; i < xTotal - xCurrent; i++) {
|
||||
line += ' '
|
||||
}
|
||||
const numbers = `${current} / ${total}`
|
||||
writeInSameLine(
|
||||
`${C('magenta')} [${line}]${C('green')} ${numbers}${C()}`,
|
||||
current !== total
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Shorthand for `logger.C(color)`.
 *
 * @param {string} [color] - value forwarded unchanged to `logger.C`
 * @returns {*} whatever `logger.C` returns
 */
function C(color) {
  return logger.C(color)
}
|
||||
|
||||
/**
 * Returns the command line parameters, i.e. `process.argv` without its first
 * two entries.
 *
 * @returns {string[]} a new array with the parameters
 */
function GetParams() {
  // slice, not splice: splice would remove the parameters from process.argv,
  // so any later read of the arguments would come back empty
  return process.argv.slice(2)
}
|
||||
|
||||
/**
 * Formats a date as `YYYY-MM-DD HH:MM:SS` using the local time zone, with
 * every field after the year zero-padded to two digits.
 *
 * @param {Date} [date] - date to format; defaults to the current time
 * @returns {string} the formatted date
 */
function GetDateString(date = new Date()) {
  const twoDigits = (value) => String(value).padStart(2, '0')

  const day = [
    date.getFullYear(),
    twoDigits(date.getMonth() + 1), // getMonth() is zero-based
    twoDigits(date.getDate()),
  ].join('-')
  const time = [
    twoDigits(date.getHours()),
    twoDigits(date.getMinutes()),
    twoDigits(date.getSeconds()),
  ].join(':')

  return `${day} ${time}`
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue