Search speedup by caching split questions/answers and refactoring the string comparison algorithm

This commit is contained in:
mrfry 2021-03-17 12:24:50 +01:00
parent 043e825302
commit 8fdc62349b
6 changed files with 152 additions and 86 deletions

View file

@ -33,6 +33,7 @@ import {
processIncomingRequest, processIncomingRequest,
logResult, logResult,
backupData, backupData,
writeData,
shouldSaveDataFile, shouldSaveDataFile,
shouldSearchDataFile, shouldSearchDataFile,
loadJSON, loadJSON,
@ -1364,6 +1365,7 @@ function GetApp(): ModuleType {
function deleteComment(obj, path) { function deleteComment(obj, path) {
if (path.length === 1) { if (path.length === 1) {
// TODO: check if its actually deleteable by user (deleting other users comments)
obj.splice(path[0], 1) obj.splice(path[0], 1)
} else { } else {
const i = path.pop() const i = path.pop()
@ -1888,7 +1890,7 @@ function GetApp(): ModuleType {
} }
if (saveDb) { if (saveDb) {
utils.WriteFile(JSON.stringify(currDb.data), currDb.path) writeData(currDb.data, currDb.path)
msgAllWorker({ msgAllWorker({
qdbs: questionDbs, qdbs: questionDbs,
type: 'update', type: 'update',

View file

@ -1,37 +1,41 @@
const minpercent = 97
const resultDbFileName = 'res.json'
// ---------------------------------------------------------------------------------------------------
const utils = require('../../dist/utils/utils.js').default // eslint-disable-line const utils = require('../../dist/utils/utils.js').default // eslint-disable-line
const logger = require('../../dist/utils/logger.js').default // eslint-disable-line const logger = require('../../dist/utils/logger.js').default // eslint-disable-line
const { addQuestion, doSearch } = require('../../dist/utils/classes.js') // eslint-disable-line const { addQuestion, doSearch } = require('../../dist/utils/classes.js') // eslint-disable-line
const { loadData } = require('../../dist/utils/actions.js') // eslint-disable-line
// TODO: merge 2 dbs const minpercent = 95
// TODO: filter questions out from a db based on another, producing a new one const resultDbFileName = 'res.json'
const params = process.argv.splice(2)
console.log('Params', params)
const fileA = params[0]
const fileB = params[1]
const dbA = utils.ReadJSON(fileA)
const dbB = fileB ? utils.ReadJSON(fileB) : null
const line = const line =
'====================================================================' '===================================================================='
const logPath = './duplicateRemovingLog/' const logPath = './duplicateRemovingLog/'
utils.CreatePath(logPath) utils.CreatePath(logPath)
const params = process.argv.splice(2)
const fileA = params[0]
const fileB = params[1]
console.time('load')
const dbA = loadData(fileA)
const dbB = fileB ? loadData(fileB) : null
console.timeEnd('load')
console.time('rmduplicates')
if (!dbB) { if (!dbB) {
console.log(`Removing duplicate questions from ${fileA}`)
rmDuplicates(dbA).then((res) => { rmDuplicates(dbA).then((res) => {
console.timeEnd('rmduplicates')
utils.WriteFile(JSON.stringify(res), resultDbFileName) utils.WriteFile(JSON.stringify(res), resultDbFileName)
console.log('File written') console.log('File written')
}) })
} else { } else {
console.log(
`Removing questions found in ${C('green')}${fileB}${C()} from ${C(
'green'
)}${fileA}${C()}`
)
difference({ dbA: dbA, dbB: dbB }).then((res) => { difference({ dbA: dbA, dbB: dbB }).then((res) => {
console.timeEnd('rmduplicates')
utils.WriteFile(JSON.stringify(res), resultDbFileName) utils.WriteFile(JSON.stringify(res), resultDbFileName)
console.log('File written') console.log('File written')
}) })
@ -85,6 +89,7 @@ async function difference({ dbA, dbB }) {
subjName: subj.Name, subjName: subj.Name,
question: question, question: question,
searchInAllIfNoResult: doingDifference, searchInAllIfNoResult: doingDifference,
searchTillMatchPercent: minpercent,
}) })
printProgressBar(j + 1, subj.Questions.length) printProgressBar(j + 1, subj.Questions.length)

View file

@ -10,6 +10,10 @@ export interface Question {
Q: string Q: string
A: string A: string
data: QuestionData data: QuestionData
cache?: {
Q: string
A: string
}
} }
export interface Subject { export interface Subject {

View file

@ -28,7 +28,13 @@ import utils from '../utils/utils'
import { SearchResult, addQuestion, getSubjNameWithoutYear } from './classes' import { SearchResult, addQuestion, getSubjNameWithoutYear } from './classes'
// types // types
import { QuestionDb, Question, User, DataFile } from '../types/basicTypes' import {
QuestionDb,
Subject,
Question,
User,
DataFile,
} from '../types/basicTypes'
// if a recievend question doesnt match at least this % to any other question in the db it gets // if a recievend question doesnt match at least this % to any other question in the db it gets
// added to db // added to db
@ -219,7 +225,7 @@ function processIncomingRequestUsingDb(
if (currWrites >= writeAfter && !dryRun) { if (currWrites >= writeAfter && !dryRun) {
currWrites = 0 currWrites = 0
logger.DebugLog('Writing data.json', 'isadding', 1) logger.DebugLog('Writing data.json', 'isadding', 1)
utils.WriteFile(JSON.stringify(qdb.data), qdb.path) writeData(qdb.data, qdb.path)
} }
} }
@ -335,6 +341,20 @@ export function shouldSaveDataFile(
return false return false
} }
// Reads the JSON file at `path` and returns its subjects, with every stored
// question passed through createQuestion.
//
// path: location of the JSON file; its content is expected to be an array of
//       objects having a `Name` and a `Questions` array.
// Returns a new array holding one { Name, Questions } object per subject, in
// file order. Any other property of a subject object is not carried over.
// Errors from reading or parsing the file are not caught here.
export function loadData(path: string): Array<Subject> {
  const subjects = JSON.parse(utils.ReadFile(path))
  // map builds the result in a single pass; spreading the accumulator inside
  // a reduce would copy the whole array again for every subject.
  return subjects.map((subj) => {
    return {
      Name: subj.Name,
      Questions: subj.Questions.map((question) => {
        return createQuestion(question)
      }),
    }
  })
}
export function loadJSON( export function loadJSON(
dataFiles: Array<DataFile>, dataFiles: Array<DataFile>,
dataDir: string dataDir: string
@ -351,7 +371,7 @@ export function loadJSON(
...dataFile, ...dataFile,
path: dataPath, path: dataPath,
index: index, index: index,
data: JSON.parse(utils.ReadFile(dataPath)), data: loadData(dataPath),
}) })
} catch (err) { } catch (err) {
console.error(err) console.error(err)
@ -364,14 +384,34 @@ export function loadJSON(
}, []) }, [])
} }
// Serializes the given subjects to JSON and writes them to `path`.
// Only Name is kept from each subject, and only Q, A and data from each
// question; every other property (for example a question's `cache`) is left
// out of the written file.
export function writeData(data: Array<Subject>, path: string): void {
  const serializable = data.map(({ Name, Questions }) => ({
    Name: Name,
    Questions: Questions.map((question) => ({
      Q: question.Q,
      A: question.A,
      data: question.data,
    })),
  }))

  utils.WriteFile(JSON.stringify(serializable), path)
}
export function backupData(questionDbs: Array<QuestionDb>): void { export function backupData(questionDbs: Array<QuestionDb>): void {
questionDbs.forEach((data) => { questionDbs.forEach((data) => {
const path = './publicDirs/qminingPublic/backs/' const path = './publicDirs/qminingPublic/backs/'
utils.CreatePath(path) utils.CreatePath(path)
try { try {
logger.Log(`Backing up ${data.name}...`) logger.Log(`Backing up ${data.name}...`)
utils.WriteFile( writeData(
JSON.stringify(data.data), data.data,
`${path}${data.name}_${utils.GetDateString(true)}.json` `${path}${data.name}_${utils.GetDateString(true)}.json`
) )
logger.Log('Done') logger.Log('Done')

View file

@ -28,12 +28,12 @@ const commonUselessAnswerParts = [
"'", "'",
] ]
const commonUselessStringParts = [',', '\\.', ':', '!', '\\+', '\\s*\\.'] // const commonUselessStringParts = [',', '\\.', ':', '!', '\\+', '\\s*\\.']
const specialChars = ['&', '\\+']
/* Percent minus for length difference */ /* Percent minus for length difference */
const lengthDiffMultiplier = 10 const lengthDiffMultiplier = 10
/* Minimum ammount to consider that two questions match during answering */ /* Minimum ammount to consider that two questions match during answering */
const minMatchAmmount = 70 const minMatchAmmount = 70
const magicNumber = 0.7 // same as minMatchAmmount, but /100
/* If all of the results are below this match percent (when only one subject is searched due to /* If all of the results are below this match percent (when only one subject is searched due to
* subject name matching) then all subjects are searched for answer */ * subject name matching) then all subjects are searched for answer */
const minMatchToNotSearchOtherSubjects = 90 const minMatchToNotSearchOtherSubjects = 90
@ -55,6 +55,14 @@ function getSubjNameWithoutYear(subjName: string): string {
// Not exported // Not exported
// --------------------------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------------------------
// Prepares a string for comparison: every whitespace character (including
// the non-breaking space) becomes a plain space, runs of spaces are collapsed
// into one, and the result is lowercased. Leading and trailing spaces are
// not trimmed.
function simplifyString(toremove) {
  const plainSpaces = toremove.replace(/\s/g, ' ')
  const collapsed = plainSpaces.replace(/\s+/g, ' ')
  return collapsed.toLowerCase()
}
function removeStuff( function removeStuff(
value: string, value: string,
removableStrings: Array<string>, removableStrings: Array<string>,
@ -67,55 +75,49 @@ function removeStuff(
return value return value
} }
// removes whitespace from begining and and, and replaces multiple spaces with one space
function removeUnnecesarySpaces(toremove: string) {
assert(toremove)
toremove = normalizeSpaces(toremove)
while (toremove.includes(' ')) {
toremove = toremove.replace(/ {2}/g, ' ')
}
return toremove.trim()
}
// simplifies a string for easier comparison
function simplifyStringForComparison(value: string) {
assert(value)
value = removeUnnecesarySpaces(value).toLowerCase()
return removeStuff(value, commonUselessStringParts)
}
function removeSpecialChars(value: string) {
assert(value)
return removeStuff(value, specialChars, ' ')
}
// damn nonbreaking space // damn nonbreaking space
function normalizeSpaces(input: string) { function normalizeSpaces(input) {
assert(input)
return input.replace(/\s/g, ' ') return input.replace(/\s/g, ' ')
} }
function compareString(string1: string, string2: string) { function removeUnnecesarySpaces(toremove: string) {
if (!string1 || !string2) { return normalizeSpaces(toremove).replace(/\s+/g, ' ')
if (!string1 && !string2) { }
function compareString(s1, s2) {
if (!s1 || !s2) {
if (!s1 && !s2) {
return 100
} else {
return 0
}
}
if (s1.length < 0 || s2.length < 0) {
if (s1.length === 0 && s2.length === 0) {
return 100 return 100
} else { } else {
return 0 return 0
} }
} }
const s1 = simplifyStringForComparison(string1).split(' ')
const s2 = simplifyStringForComparison(string2).split(' ')
let match = 0 let match = 0
for (let i = 0; i < s1.length; i++) { let lastMatchIndex = -1
if (s2.includes(s1[i])) { let i = 0
match++
while (i < s1.length) {
if (match / i < magicNumber) {
break
} }
const currMatchIndex = s2.indexOf(s1[i])
if (lastMatchIndex < currMatchIndex) {
match++
lastMatchIndex = currMatchIndex
}
i++
} }
let percent = Math.round(parseFloat(((match / s1.length) * 100).toFixed(2))) let percent = Math.round(parseFloat(((match / s1.length) * 100).toFixed(2)))
const lengthDifference = Math.abs(s2.length - s1.length) const lengthDifference = Math.abs(s2.length - s1.length)
percent -= lengthDifference * lengthDiffMultiplier percent -= lengthDifference * lengthDiffMultiplier
@ -163,7 +165,6 @@ function simplifyAnswer(value: string) {
return value return value
} }
return simplifyQA(value, [ return simplifyQA(value, [
removeSpecialChars,
removeUnnecesarySpaces, removeUnnecesarySpaces,
answerPreProcessor, answerPreProcessor,
removeAnswerLetters, removeAnswerLetters,
@ -175,22 +176,16 @@ function simplifyQuestion(question: Question | string) {
return return
} }
if (typeof question === 'string') { if (typeof question === 'string') {
return simplifyQA(question, [ return simplifyQA(question, [removeUnnecesarySpaces, removeAnswerLetters])
removeSpecialChars,
removeUnnecesarySpaces,
removeAnswerLetters,
])
} else { } else {
if (question.Q) { if (question.Q) {
question.Q = simplifyQA(question.Q, [ question.Q = simplifyQA(question.Q, [
removeSpecialChars,
removeUnnecesarySpaces, removeUnnecesarySpaces,
removeAnswerLetters, removeAnswerLetters,
]) ])
} }
if (question.A) { if (question.A) {
question.A = simplifyQA(question.A, [ question.A = simplifyQA(question.A, [
removeSpecialChars,
removeUnnecesarySpaces, removeUnnecesarySpaces,
removeAnswerLetters, removeAnswerLetters,
]) ])
@ -205,13 +200,29 @@ function simplifyQuestion(question: Question | string) {
function createQuestion( function createQuestion(
question: Question | string, question: Question | string,
answer: string, answer?: string,
data: QuestionData data?: QuestionData
): Question { ): Question {
return { try {
Q: simplifyQuestion(question), if (typeof question === 'string') {
A: answer ? simplifyAnswer(answer) : undefined, return {
data: data, Q: simplifyQuestion(question),
A: answer ? simplifyAnswer(answer) : undefined,
data: data,
}
} else {
return {
...question,
cache: {
Q: question.Q ? simplifyString(question.Q).split(' ') : [],
A: question.A ? simplifyString(question.A).split(' ') : [],
},
}
}
} catch (err) {
logger.Log('Error creating question', logger.GetColor('redbg'))
console.error(question, answer, data)
console.error(err)
} }
} }
@ -257,11 +268,11 @@ function compareData(q1: Question, q2: Question) {
} }
function compareQuestion(q1: Question, q2: Question) { function compareQuestion(q1: Question, q2: Question) {
return compareString(q1.Q, q2.Q) return compareString(q1.cache.Q, q2.cache.Q)
} }
function compareAnswer(q1: Question, q2: Question) { function compareAnswer(q1: Question, q2: Question) {
return compareString(q1.A, q2.A) return compareString(q1.cache.A, q2.cache.A)
} }
function compareQuestionObj( function compareQuestionObj(
@ -328,7 +339,11 @@ function searchSubject(
assert(question) assert(question)
let result = [] let result = []
subj.Questions.every((currentQuestion) => {
let stopSearch = false
let i = subj.Questions.length - 1
while (i >= 0 && !stopSearch) {
const currentQuestion = subj.Questions[i]
const percent = compareQuestionObj( const percent = compareQuestionObj(
currentQuestion, currentQuestion,
subjName, subjName,
@ -337,7 +352,7 @@ function searchSubject(
question.data question.data
) )
if (percent.avg > minMatchAmmount) { if (percent.avg >= minMatchAmmount) {
result.push({ result.push({
q: currentQuestion, q: currentQuestion,
match: percent.avg, match: percent.avg,
@ -346,11 +361,11 @@ function searchSubject(
} }
if (searchTillMatchPercent && percent.avg >= searchTillMatchPercent) { if (searchTillMatchPercent && percent.avg >= searchTillMatchPercent) {
return false stopSearch = true
} }
return true i--
}) }
result = result.sort((q1, q2) => { result = result.sort((q1, q2) => {
if (q1.match < q2.match) { if (q1.match < q2.match) {
@ -421,9 +436,9 @@ function prepareQuestion(
let preparedQuestion: Question let preparedQuestion: Question
if (typeof question === 'object') { if (typeof question === 'object') {
preparedQuestion = question preparedQuestion = createQuestion(question)
} else { } else {
let parsedData let parsedData: any
if (typeof data === 'string') { if (typeof data === 'string') {
try { try {
parsedData = JSON.parse(data) parsedData = JSON.parse(data)

@ -1 +1 @@
Subproject commit 49eae83f8194ab9585939b93119f82f7c0da16bb Subproject commit 7f4163736cc0bfed3259f39f7bc0063ca191da21