Search speedup by: caching split questions/answers, and refactoring the string compare algorithm

This commit is contained in:
mrfry 2021-03-17 12:24:50 +01:00
parent 043e825302
commit 8fdc62349b
6 changed files with 152 additions and 86 deletions

View file

@ -33,6 +33,7 @@ import {
processIncomingRequest,
logResult,
backupData,
writeData,
shouldSaveDataFile,
shouldSearchDataFile,
loadJSON,
@ -1364,6 +1365,7 @@ function GetApp(): ModuleType {
function deleteComment(obj, path) {
if (path.length === 1) {
// TODO: check if its actually deleteable by user (deleting other users comments)
obj.splice(path[0], 1)
} else {
const i = path.pop()
@ -1888,7 +1890,7 @@ function GetApp(): ModuleType {
}
if (saveDb) {
utils.WriteFile(JSON.stringify(currDb.data), currDb.path)
writeData(currDb.data, currDb.path)
msgAllWorker({
qdbs: questionDbs,
type: 'update',

View file

@ -1,37 +1,41 @@
const minpercent = 97
const resultDbFileName = 'res.json'
// ---------------------------------------------------------------------------------------------------
const utils = require('../../dist/utils/utils.js').default // eslint-disable-line
const logger = require('../../dist/utils/logger.js').default // eslint-disable-line
const { addQuestion, doSearch } = require('../../dist/utils/classes.js') // eslint-disable-line
const { loadData } = require('../../dist/utils/actions.js') // eslint-disable-line
// TODO: merge 2 dbs
// TODO: filter questions out from a db based on another, producing a new one
const params = process.argv.splice(2)
console.log('Params', params)
const fileA = params[0]
const fileB = params[1]
const dbA = utils.ReadJSON(fileA)
const dbB = fileB ? utils.ReadJSON(fileB) : null
const minpercent = 95
const resultDbFileName = 'res.json'
const line =
'===================================================================='
const logPath = './duplicateRemovingLog/'
utils.CreatePath(logPath)
const params = process.argv.splice(2)
const fileA = params[0]
const fileB = params[1]
console.time('load')
const dbA = loadData(fileA)
const dbB = fileB ? loadData(fileB) : null
console.timeEnd('load')
console.time('rmduplicates')
if (!dbB) {
console.log(`Removing duplicate questions from ${fileA}`)
rmDuplicates(dbA).then((res) => {
console.timeEnd('rmduplicates')
utils.WriteFile(JSON.stringify(res), resultDbFileName)
console.log('File written')
})
} else {
console.log(
`Removing questions found in ${C('green')}${fileB}${C()} from ${C(
'green'
)}${fileA}${C()}`
)
difference({ dbA: dbA, dbB: dbB }).then((res) => {
console.timeEnd('rmduplicates')
utils.WriteFile(JSON.stringify(res), resultDbFileName)
console.log('File written')
})
@ -85,6 +89,7 @@ async function difference({ dbA, dbB }) {
subjName: subj.Name,
question: question,
searchInAllIfNoResult: doingDifference,
searchTillMatchPercent: minpercent,
})
printProgressBar(j + 1, subj.Questions.length)

View file

@ -10,6 +10,10 @@ export interface Question {
Q: string
A: string
data: QuestionData
// Pre-tokenized forms of Q and A used for comparison: each is the
// lower-cased, whitespace-normalized text split on single spaces.
// Optional, because it only exists on questions that were built from an
// already stored question object.
cache?: {
  Q: Array<string>
  A: Array<string>
}
}
export interface Subject {

View file

@ -28,7 +28,13 @@ import utils from '../utils/utils'
import { SearchResult, addQuestion, getSubjNameWithoutYear } from './classes'
// types
import { QuestionDb, Question, User, DataFile } from '../types/basicTypes'
import {
QuestionDb,
Subject,
Question,
User,
DataFile,
} from '../types/basicTypes'
// if a received question doesn't match at least this % to any other question in the db it gets
// added to db
@ -219,7 +225,7 @@ function processIncomingRequestUsingDb(
if (currWrites >= writeAfter && !dryRun) {
currWrites = 0
logger.DebugLog('Writing data.json', 'isadding', 1)
utils.WriteFile(JSON.stringify(qdb.data), qdb.path)
writeData(qdb.data, qdb.path)
}
}
@ -335,6 +341,20 @@ export function shouldSaveDataFile(
return false
}
// Reads the JSON file at `path` and returns its subjects, with every stored
// question passed through createQuestion.
//
// path: location of a JSON file holding an array of { Name, Questions }.
// Returns a new array of subjects; only Name and Questions are carried over.
// Throws whatever utils.ReadFile / JSON.parse throw on unreadable or
// malformed input.
export function loadData(path: string): Array<Subject> {
  const storedSubjects = JSON.parse(utils.ReadFile(path))
  // One pass with map instead of re-copying the accumulator per subject.
  return storedSubjects.map((subj) => ({
    Name: subj.Name,
    // Explicit arrow on purpose: passing createQuestion directly would feed
    // map's index and array arguments into its answer/data parameters.
    Questions: subj.Questions.map((question) => createQuestion(question)),
  }))
}
export function loadJSON(
dataFiles: Array<DataFile>,
dataDir: string
@ -351,7 +371,7 @@ export function loadJSON(
...dataFile,
path: dataPath,
index: index,
data: JSON.parse(utils.ReadFile(dataPath)),
data: loadData(dataPath),
})
} catch (err) {
console.error(err)
@ -364,14 +384,34 @@ export function loadJSON(
}, [])
}
// Serializes the given subjects to JSON and writes them to `path`.
// Only Name on subjects and Q / A / data on questions are written out; any
// other property on a question object is left out of the file.
export function writeData(data: Array<Subject>, path: string): void {
  const serializable = data.map(({ Name, Questions }) => ({
    Name,
    Questions: Questions.map(({ Q, A, data: questionData }) => ({
      Q,
      A,
      data: questionData,
    })),
  }))
  utils.WriteFile(JSON.stringify(serializable), path)
}
export function backupData(questionDbs: Array<QuestionDb>): void {
questionDbs.forEach((data) => {
const path = './publicDirs/qminingPublic/backs/'
utils.CreatePath(path)
try {
logger.Log(`Backing up ${data.name}...`)
utils.WriteFile(
JSON.stringify(data.data),
writeData(
data.data,
`${path}${data.name}_${utils.GetDateString(true)}.json`
)
logger.Log('Done')

View file

@ -28,12 +28,12 @@ const commonUselessAnswerParts = [
"'",
]
const commonUselessStringParts = [',', '\\.', ':', '!', '\\+', '\\s*\\.']
const specialChars = ['&', '\\+']
// const commonUselessStringParts = [',', '\\.', ':', '!', '\\+', '\\s*\\.']
/* Percent minus for length difference */
const lengthDiffMultiplier = 10
/* Minimum ammount to consider that two questions match during answering */
const minMatchAmmount = 70
const magicNumber = 0.7 // same as minMatchAmmount, but /100
/* If all of the results are below this match percent (when only one subject is searched due to
* subject name matching) then all subjects are searched for answer */
const minMatchToNotSearchOtherSubjects = 90
@ -55,6 +55,14 @@ function getSubjNameWithoutYear(subjName: string): string {
// Not exported
// ---------------------------------------------------------------------------------------------------------
// Normalizes a string for comparison: every run of whitespace (including
// non-breaking spaces, tabs and newlines) becomes a single space, and the
// result is lower-cased. Leading/trailing whitespace is NOT trimmed, it is
// only collapsed to one space.
function simplifyString(toremove) {
  // /\s+/ already matches each individual whitespace character, so a
  // separate /\s/ pass beforehand is not needed.
  return toremove.replace(/\s+/g, ' ').toLowerCase()
}
function removeStuff(
value: string,
removableStrings: Array<string>,
@ -67,55 +75,49 @@ function removeStuff(
return value
}
// removes whitespace from begining and and, and replaces multiple spaces with one space
function removeUnnecesarySpaces(toremove: string) {
assert(toremove)
toremove = normalizeSpaces(toremove)
while (toremove.includes(' ')) {
toremove = toremove.replace(/ {2}/g, ' ')
}
return toremove.trim()
}
// simplifies a string for easier comparison
function simplifyStringForComparison(value: string) {
assert(value)
value = removeUnnecesarySpaces(value).toLowerCase()
return removeStuff(value, commonUselessStringParts)
}
function removeSpecialChars(value: string) {
assert(value)
return removeStuff(value, specialChars, ' ')
}
// damn nonbreaking space
function normalizeSpaces(input: string) {
assert(input)
// Replaces every single whitespace character (non-breaking space, tab,
// newline, ...) with a plain space. Runs of whitespace are not collapsed:
// each character is replaced one-for-one.
function normalizeSpaces(input: string): string {
  return input.replace(/\s/g, ' ')
}
function compareString(string1: string, string2: string) {
if (!string1 || !string2) {
if (!string1 && !string2) {
// Collapses every run of whitespace (of any kind, including non-breaking
// spaces) into one plain space. Does not trim the ends of the string.
function removeUnnecesarySpaces(toremove: string) {
  // A single /\s+/ pass both normalizes exotic whitespace and collapses runs.
  return toremove.replace(/\s+/g, ' ')
}
function compareString(s1, s2) {
if (!s1 || !s2) {
if (!s1 && !s2) {
return 100
} else {
return 0
}
}
if (s1.length < 0 || s2.length < 0) {
if (s1.length === 0 && s2.length === 0) {
return 100
} else {
return 0
}
}
const s1 = simplifyStringForComparison(string1).split(' ')
const s2 = simplifyStringForComparison(string2).split(' ')
let match = 0
for (let i = 0; i < s1.length; i++) {
if (s2.includes(s1[i])) {
match++
let lastMatchIndex = -1
let i = 0
while (i < s1.length) {
if (match / i < magicNumber) {
break
}
const currMatchIndex = s2.indexOf(s1[i])
if (lastMatchIndex < currMatchIndex) {
match++
lastMatchIndex = currMatchIndex
}
i++
}
let percent = Math.round(parseFloat(((match / s1.length) * 100).toFixed(2)))
const lengthDifference = Math.abs(s2.length - s1.length)
percent -= lengthDifference * lengthDiffMultiplier
@ -163,7 +165,6 @@ function simplifyAnswer(value: string) {
return value
}
return simplifyQA(value, [
removeSpecialChars,
removeUnnecesarySpaces,
answerPreProcessor,
removeAnswerLetters,
@ -175,22 +176,16 @@ function simplifyQuestion(question: Question | string) {
return
}
if (typeof question === 'string') {
return simplifyQA(question, [
removeSpecialChars,
removeUnnecesarySpaces,
removeAnswerLetters,
])
return simplifyQA(question, [removeUnnecesarySpaces, removeAnswerLetters])
} else {
if (question.Q) {
question.Q = simplifyQA(question.Q, [
removeSpecialChars,
removeUnnecesarySpaces,
removeAnswerLetters,
])
}
if (question.A) {
question.A = simplifyQA(question.A, [
removeSpecialChars,
removeUnnecesarySpaces,
removeAnswerLetters,
])
@ -205,13 +200,29 @@ function simplifyQuestion(question: Question | string) {
// Builds a Question object ready for comparison.
//
// question: either raw question text, or an existing question object.
// answer:   raw answer text; only used when `question` is a string.
// data:     extra question data; only used when `question` is a string.
//
// - string input: Q and A are run through simplifyQuestion / simplifyAnswer.
// - object input: the object's own properties are copied as they are.
// In both cases a `cache` with the tokenized Q and A is attached, because
// compareQuestion / compareAnswer read `cache` rather than Q / A.
//
// Errors are logged instead of propagated; in that case nothing is returned
// (the caller receives undefined).
function createQuestion(
  question: Question | string,
  answer?: string,
  data?: QuestionData
): Question {
  try {
    if (typeof question === 'string') {
      const simplifiedQ = simplifyQuestion(question)
      const simplifiedA = answer ? simplifyAnswer(answer) : undefined
      return {
        Q: simplifiedQ,
        A: simplifiedA,
        data: data,
        cache: {
          Q: simplifiedQ ? simplifyString(simplifiedQ).split(' ') : [],
          A: simplifiedA ? simplifyString(simplifiedA).split(' ') : [],
        },
      }
    } else {
      return {
        ...question,
        cache: {
          Q: question.Q ? simplifyString(question.Q).split(' ') : [],
          A: question.A ? simplifyString(question.A).split(' ') : [],
        },
      }
    }
  } catch (err) {
    logger.Log('Error creating question', logger.GetColor('redbg'))
    console.error(question, answer, data)
    console.error(err)
  }
}
@ -257,11 +268,11 @@ function compareData(q1: Question, q2: Question) {
}
// Compares the question texts of two questions via compareString and returns
// its result. Uses the pre-tokenized `cache.Q` when a question has one;
// `cache` is optional, so otherwise the tokens are derived from Q on the fly
// instead of failing on a missing cache.
function compareQuestion(q1: Question, q2: Question) {
  const questionTokens = (q: Question) => {
    if (q.cache) {
      return q.cache.Q
    }
    return q.Q ? simplifyString(q.Q).split(' ') : []
  }
  return compareString(questionTokens(q1), questionTokens(q2))
}
// Compares the answer texts of two questions via compareString and returns
// its result. Uses the pre-tokenized `cache.A` when a question has one;
// `cache` is optional, so otherwise the tokens are derived from A on the fly
// instead of failing on a missing cache.
function compareAnswer(q1: Question, q2: Question) {
  const answerTokens = (q: Question) => {
    if (q.cache) {
      return q.cache.A
    }
    return q.A ? simplifyString(q.A).split(' ') : []
  }
  return compareString(answerTokens(q1), answerTokens(q2))
}
function compareQuestionObj(
@ -328,7 +339,11 @@ function searchSubject(
assert(question)
let result = []
subj.Questions.every((currentQuestion) => {
let stopSearch = false
let i = subj.Questions.length - 1
while (i >= 0 && !stopSearch) {
const currentQuestion = subj.Questions[i]
const percent = compareQuestionObj(
currentQuestion,
subjName,
@ -337,7 +352,7 @@ function searchSubject(
question.data
)
if (percent.avg > minMatchAmmount) {
if (percent.avg >= minMatchAmmount) {
result.push({
q: currentQuestion,
match: percent.avg,
@ -346,11 +361,11 @@ function searchSubject(
}
if (searchTillMatchPercent && percent.avg >= searchTillMatchPercent) {
return false
stopSearch = true
}
return true
})
i--
}
result = result.sort((q1, q2) => {
if (q1.match < q2.match) {
@ -421,9 +436,9 @@ function prepareQuestion(
let preparedQuestion: Question
if (typeof question === 'object') {
preparedQuestion = question
preparedQuestion = createQuestion(question)
} else {
let parsedData
let parsedData: any
if (typeof data === 'string') {
try {
parsedData = JSON.parse(data)

@ -1 +1 @@
Subproject commit 49eae83f8194ab9585939b93119f82f7c0da16bb
Subproject commit 7f4163736cc0bfed3259f39f7bc0063ca191da21