mirror of
https://gitlab.com/MrFry/mrfrys-node-server
synced 2025-04-01 20:24:18 +02:00
Search speedup by caching split questions/answers and refactoring the string comparison algorithm
This commit is contained in:
parent
043e825302
commit
8fdc62349b
6 changed files with 152 additions and 86 deletions
|
@ -33,6 +33,7 @@ import {
|
|||
processIncomingRequest,
|
||||
logResult,
|
||||
backupData,
|
||||
writeData,
|
||||
shouldSaveDataFile,
|
||||
shouldSearchDataFile,
|
||||
loadJSON,
|
||||
|
@ -1364,6 +1365,7 @@ function GetApp(): ModuleType {
|
|||
|
||||
function deleteComment(obj, path) {
|
||||
if (path.length === 1) {
|
||||
// TODO: check if it's actually deletable by the user (to prevent deleting other users' comments)
|
||||
obj.splice(path[0], 1)
|
||||
} else {
|
||||
const i = path.pop()
|
||||
|
@ -1888,7 +1890,7 @@ function GetApp(): ModuleType {
|
|||
}
|
||||
|
||||
if (saveDb) {
|
||||
utils.WriteFile(JSON.stringify(currDb.data), currDb.path)
|
||||
writeData(currDb.data, currDb.path)
|
||||
msgAllWorker({
|
||||
qdbs: questionDbs,
|
||||
type: 'update',
|
||||
|
|
|
@ -1,37 +1,41 @@
|
|||
const minpercent = 97
|
||||
const resultDbFileName = 'res.json'
|
||||
|
||||
// ---------------------------------------------------------------------------------------------------
|
||||
|
||||
const utils = require('../../dist/utils/utils.js').default // eslint-disable-line
|
||||
const logger = require('../../dist/utils/logger.js').default // eslint-disable-line
|
||||
const { addQuestion, doSearch } = require('../../dist/utils/classes.js') // eslint-disable-line
|
||||
const { loadData } = require('../../dist/utils/actions.js') // eslint-disable-line
|
||||
|
||||
// TODO: merge 2 dbs
|
||||
// TODO: filter questions out from a db based on another, producing a new one
|
||||
|
||||
const params = process.argv.splice(2)
|
||||
|
||||
console.log('Params', params)
|
||||
|
||||
const fileA = params[0]
|
||||
const fileB = params[1]
|
||||
|
||||
const dbA = utils.ReadJSON(fileA)
|
||||
const dbB = fileB ? utils.ReadJSON(fileB) : null
|
||||
|
||||
const minpercent = 95
|
||||
const resultDbFileName = 'res.json'
|
||||
const line =
|
||||
'===================================================================='
|
||||
const logPath = './duplicateRemovingLog/'
|
||||
utils.CreatePath(logPath)
|
||||
|
||||
const params = process.argv.splice(2)
|
||||
|
||||
const fileA = params[0]
|
||||
const fileB = params[1]
|
||||
|
||||
console.time('load')
|
||||
const dbA = loadData(fileA)
|
||||
const dbB = fileB ? loadData(fileB) : null
|
||||
console.timeEnd('load')
|
||||
|
||||
console.time('rmduplicates')
|
||||
if (!dbB) {
|
||||
console.log(`Removing duplicate questions from ${fileA}`)
|
||||
rmDuplicates(dbA).then((res) => {
|
||||
console.timeEnd('rmduplicates')
|
||||
utils.WriteFile(JSON.stringify(res), resultDbFileName)
|
||||
console.log('File written')
|
||||
})
|
||||
} else {
|
||||
console.log(
|
||||
`Removing questions found in ${C('green')}${fileB}${C()} from ${C(
|
||||
'green'
|
||||
)}${fileA}${C()}`
|
||||
)
|
||||
difference({ dbA: dbA, dbB: dbB }).then((res) => {
|
||||
console.timeEnd('rmduplicates')
|
||||
utils.WriteFile(JSON.stringify(res), resultDbFileName)
|
||||
console.log('File written')
|
||||
})
|
||||
|
@ -85,6 +89,7 @@ async function difference({ dbA, dbB }) {
|
|||
subjName: subj.Name,
|
||||
question: question,
|
||||
searchInAllIfNoResult: doingDifference,
|
||||
searchTillMatchPercent: minpercent,
|
||||
})
|
||||
|
||||
printProgressBar(j + 1, subj.Questions.length)
|
||||
|
|
|
@ -10,6 +10,10 @@ export interface Question {
|
|||
Q: string
|
||||
A: string
|
||||
data: QuestionData
|
||||
// Pre-split word lists of Q and A. createQuestion fills these with
// simplifyString(...).split(' '), so both fields are arrays of words, not strings.
cache?: {
  Q: Array<string>
  A: Array<string>
}
|
||||
}
|
||||
|
||||
export interface Subject {
|
||||
|
|
|
@ -28,7 +28,13 @@ import utils from '../utils/utils'
|
|||
import { SearchResult, addQuestion, getSubjNameWithoutYear } from './classes'
|
||||
|
||||
// types
|
||||
import { QuestionDb, Question, User, DataFile } from '../types/basicTypes'
|
||||
import {
|
||||
QuestionDb,
|
||||
Subject,
|
||||
Question,
|
||||
User,
|
||||
DataFile,
|
||||
} from '../types/basicTypes'
|
||||
|
||||
// if a received question doesn't match any other question in the db by at least this %, it gets
|
||||
// added to db
|
||||
|
@ -219,7 +225,7 @@ function processIncomingRequestUsingDb(
|
|||
if (currWrites >= writeAfter && !dryRun) {
|
||||
currWrites = 0
|
||||
logger.DebugLog('Writing data.json', 'isadding', 1)
|
||||
utils.WriteFile(JSON.stringify(qdb.data), qdb.path)
|
||||
writeData(qdb.data, qdb.path)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -335,6 +341,20 @@ export function shouldSaveDataFile(
|
|||
return false
|
||||
}
|
||||
|
||||
/**
 * Reads the JSON file at `path`, which must hold an array of subjects, and returns it with
 * every question passed through createQuestion. Only `Name` and `Questions` of each subject
 * are carried over.
 *
 * @param path location of the data file
 * @returns the subjects of the file, in file order
 * @throws if the file content is not valid JSON or its top level value is not an array
 */
export function loadData(path: string): Array<Subject> {
  const parsed: unknown = JSON.parse(utils.ReadFile(path))
  if (!Array.isArray(parsed)) {
    throw new Error(`Invalid data file, expected an array of subjects: ${path}`)
  }

  // map instead of reduce + spread: the spread re-copied the accumulator for every subject
  return parsed.map((subj) => {
    return {
      Name: subj.Name,
      // explicit arrow so map's index argument is not passed on as createQuestion's `answer`
      Questions: subj.Questions.map((question) => createQuestion(question)),
    }
  })
}
|
||||
|
||||
export function loadJSON(
|
||||
dataFiles: Array<DataFile>,
|
||||
dataDir: string
|
||||
|
@ -351,7 +371,7 @@ export function loadJSON(
|
|||
...dataFile,
|
||||
path: dataPath,
|
||||
index: index,
|
||||
data: JSON.parse(utils.ReadFile(dataPath)),
|
||||
data: loadData(dataPath),
|
||||
})
|
||||
} catch (err) {
|
||||
console.error(err)
|
||||
|
@ -364,14 +384,34 @@ export function loadJSON(
|
|||
}, [])
|
||||
}
|
||||
|
||||
/**
 * Serializes `data` to JSON and writes it to `path` using utils.WriteFile. Subjects are
 * reduced to `Name` and `Questions`, questions to `Q`, `A` and `data`, so the `cache` field
 * of a question is never written to the file.
 *
 * @param data subjects to persist
 * @param path destination file
 */
export function writeData(data: Array<Subject>, path: string): void {
  const stripQuestion = ({ Q, A, data: questionData }: Question) => ({
    Q,
    A,
    data: questionData,
  })
  const serializable = data.map(({ Name, Questions }) => ({
    Name,
    Questions: Questions.map(stripQuestion),
  }))

  utils.WriteFile(JSON.stringify(serializable), path)
}
|
||||
|
||||
export function backupData(questionDbs: Array<QuestionDb>): void {
|
||||
questionDbs.forEach((data) => {
|
||||
const path = './publicDirs/qminingPublic/backs/'
|
||||
utils.CreatePath(path)
|
||||
try {
|
||||
logger.Log(`Backing up ${data.name}...`)
|
||||
utils.WriteFile(
|
||||
JSON.stringify(data.data),
|
||||
writeData(
|
||||
data.data,
|
||||
`${path}${data.name}_${utils.GetDateString(true)}.json`
|
||||
)
|
||||
logger.Log('Done')
|
||||
|
|
|
@ -28,12 +28,12 @@ const commonUselessAnswerParts = [
|
|||
"'",
|
||||
]
|
||||
|
||||
const commonUselessStringParts = [',', '\\.', ':', '!', '\\+', '\\s*\\.']
|
||||
const specialChars = ['&', '\\+']
|
||||
// const commonUselessStringParts = [',', '\\.', ':', '!', '\\+', '\\s*\\.']
|
||||
/* Percent minus for length difference */
|
||||
const lengthDiffMultiplier = 10
|
||||
/* Minimum amount to consider that two questions match during answering */
|
||||
const minMatchAmmount = 70
|
||||
const magicNumber = 0.7 // same as minMatchAmmount, but /100
|
||||
/* If all of the results are below this match percent (when only one subject is searched due to
|
||||
* subject name matching) then all subjects are searched for answer */
|
||||
const minMatchToNotSearchOtherSubjects = 90
|
||||
|
@ -55,6 +55,14 @@ function getSubjNameWithoutYear(subjName: string): string {
|
|||
|
||||
// Not exported
|
||||
// ---------------------------------------------------------------------------------------------------------
|
||||
|
||||
/**
 * Normalizes a string for word-by-word comparison: surrounding whitespace is dropped, every
 * run of whitespace (non-breaking spaces, tabs and line breaks included) becomes a single
 * space, and the result is lower-cased.
 *
 * @param toremove the raw question or answer text
 * @returns the normalized text; for non-blank input, splitting it on ' ' yields no empty words
 */
function simplifyString(toremove: string): string {
  // trim first: leading/trailing whitespace would otherwise turn into empty words after
  // split(' '), which skews both the match count and the length-difference penalty
  return toremove.trim().replace(/\s+/g, ' ').toLowerCase()
}
|
||||
|
||||
function removeStuff(
|
||||
value: string,
|
||||
removableStrings: Array<string>,
|
||||
|
@ -67,55 +75,49 @@ function removeStuff(
|
|||
return value
|
||||
}
|
||||
|
||||
// removes whitespace from the beginning and end, and replaces multiple spaces with one space
|
||||
function removeUnnecesarySpaces(toremove: string) {
|
||||
assert(toremove)
|
||||
|
||||
toremove = normalizeSpaces(toremove)
|
||||
while (toremove.includes(' ')) {
|
||||
toremove = toremove.replace(/ {2}/g, ' ')
|
||||
}
|
||||
return toremove.trim()
|
||||
}
|
||||
|
||||
// simplifies a string for easier comparison
|
||||
function simplifyStringForComparison(value: string) {
|
||||
assert(value)
|
||||
|
||||
value = removeUnnecesarySpaces(value).toLowerCase()
|
||||
return removeStuff(value, commonUselessStringParts)
|
||||
}
|
||||
|
||||
function removeSpecialChars(value: string) {
|
||||
assert(value)
|
||||
|
||||
return removeStuff(value, specialChars, ' ')
|
||||
}
|
||||
|
||||
// damn nonbreaking space
|
||||
function normalizeSpaces(input: string) {
|
||||
assert(input)
|
||||
|
||||
function normalizeSpaces(input) {
|
||||
return input.replace(/\s/g, ' ')
|
||||
}
|
||||
|
||||
function compareString(string1: string, string2: string) {
|
||||
if (!string1 || !string2) {
|
||||
if (!string1 && !string2) {
|
||||
function removeUnnecesarySpaces(toremove: string) {
|
||||
return normalizeSpaces(toremove).replace(/\s+/g, ' ')
|
||||
}
|
||||
|
||||
function compareString(s1, s2) {
|
||||
if (!s1 || !s2) {
|
||||
if (!s1 && !s2) {
|
||||
return 100
|
||||
} else {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
if (s1.length < 0 || s2.length < 0) {
|
||||
if (s1.length === 0 && s2.length === 0) {
|
||||
return 100
|
||||
} else {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
const s1 = simplifyStringForComparison(string1).split(' ')
|
||||
const s2 = simplifyStringForComparison(string2).split(' ')
|
||||
let match = 0
|
||||
for (let i = 0; i < s1.length; i++) {
|
||||
if (s2.includes(s1[i])) {
|
||||
let lastMatchIndex = -1
|
||||
let i = 0
|
||||
|
||||
while (i < s1.length) {
|
||||
if (match / i < magicNumber) {
|
||||
break
|
||||
}
|
||||
|
||||
const currMatchIndex = s2.indexOf(s1[i])
|
||||
if (lastMatchIndex < currMatchIndex) {
|
||||
match++
|
||||
lastMatchIndex = currMatchIndex
|
||||
}
|
||||
|
||||
i++
|
||||
}
|
||||
|
||||
let percent = Math.round(parseFloat(((match / s1.length) * 100).toFixed(2)))
|
||||
const lengthDifference = Math.abs(s2.length - s1.length)
|
||||
percent -= lengthDifference * lengthDiffMultiplier
|
||||
|
@ -163,7 +165,6 @@ function simplifyAnswer(value: string) {
|
|||
return value
|
||||
}
|
||||
return simplifyQA(value, [
|
||||
removeSpecialChars,
|
||||
removeUnnecesarySpaces,
|
||||
answerPreProcessor,
|
||||
removeAnswerLetters,
|
||||
|
@ -175,22 +176,16 @@ function simplifyQuestion(question: Question | string) {
|
|||
return
|
||||
}
|
||||
if (typeof question === 'string') {
|
||||
return simplifyQA(question, [
|
||||
removeSpecialChars,
|
||||
removeUnnecesarySpaces,
|
||||
removeAnswerLetters,
|
||||
])
|
||||
return simplifyQA(question, [removeUnnecesarySpaces, removeAnswerLetters])
|
||||
} else {
|
||||
if (question.Q) {
|
||||
question.Q = simplifyQA(question.Q, [
|
||||
removeSpecialChars,
|
||||
removeUnnecesarySpaces,
|
||||
removeAnswerLetters,
|
||||
])
|
||||
}
|
||||
if (question.A) {
|
||||
question.A = simplifyQA(question.A, [
|
||||
removeSpecialChars,
|
||||
removeUnnecesarySpaces,
|
||||
removeAnswerLetters,
|
||||
])
|
||||
|
@ -205,14 +200,30 @@ function simplifyQuestion(question: Question | string) {
|
|||
|
||||
function createQuestion(
|
||||
question: Question | string,
|
||||
answer: string,
|
||||
data: QuestionData
|
||||
answer?: string,
|
||||
data?: QuestionData
|
||||
): Question {
|
||||
try {
|
||||
if (typeof question === 'string') {
|
||||
return {
|
||||
Q: simplifyQuestion(question),
|
||||
A: answer ? simplifyAnswer(answer) : undefined,
|
||||
data: data,
|
||||
}
|
||||
} else {
|
||||
return {
|
||||
...question,
|
||||
cache: {
|
||||
Q: question.Q ? simplifyString(question.Q).split(' ') : [],
|
||||
A: question.A ? simplifyString(question.A).split(' ') : [],
|
||||
},
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
logger.Log('Error creating question', logger.GetColor('redbg'))
|
||||
console.error(question, answer, data)
|
||||
console.error(err)
|
||||
}
|
||||
}
|
||||
|
||||
function compareImage(data: QuestionData, data2: QuestionData) {
|
||||
|
@ -257,11 +268,11 @@ function compareData(q1: Question, q2: Question) {
|
|||
}
|
||||
|
||||
function compareQuestion(q1: Question, q2: Question) {
|
||||
return compareString(q1.Q, q2.Q)
|
||||
return compareString(q1.cache.Q, q2.cache.Q)
|
||||
}
|
||||
|
||||
function compareAnswer(q1: Question, q2: Question) {
|
||||
return compareString(q1.A, q2.A)
|
||||
return compareString(q1.cache.A, q2.cache.A)
|
||||
}
|
||||
|
||||
function compareQuestionObj(
|
||||
|
@ -328,7 +339,11 @@ function searchSubject(
|
|||
assert(question)
|
||||
|
||||
let result = []
|
||||
subj.Questions.every((currentQuestion) => {
|
||||
|
||||
let stopSearch = false
|
||||
let i = subj.Questions.length - 1
|
||||
while (i >= 0 && !stopSearch) {
|
||||
const currentQuestion = subj.Questions[i]
|
||||
const percent = compareQuestionObj(
|
||||
currentQuestion,
|
||||
subjName,
|
||||
|
@ -337,7 +352,7 @@ function searchSubject(
|
|||
question.data
|
||||
)
|
||||
|
||||
if (percent.avg > minMatchAmmount) {
|
||||
if (percent.avg >= minMatchAmmount) {
|
||||
result.push({
|
||||
q: currentQuestion,
|
||||
match: percent.avg,
|
||||
|
@ -346,11 +361,11 @@ function searchSubject(
|
|||
}
|
||||
|
||||
if (searchTillMatchPercent && percent.avg >= searchTillMatchPercent) {
|
||||
return false
|
||||
stopSearch = true
|
||||
}
|
||||
|
||||
return true
|
||||
})
|
||||
i--
|
||||
}
|
||||
|
||||
result = result.sort((q1, q2) => {
|
||||
if (q1.match < q2.match) {
|
||||
|
@ -421,9 +436,9 @@ function prepareQuestion(
|
|||
let preparedQuestion: Question
|
||||
|
||||
if (typeof question === 'object') {
|
||||
preparedQuestion = question
|
||||
preparedQuestion = createQuestion(question)
|
||||
} else {
|
||||
let parsedData
|
||||
let parsedData: any
|
||||
if (typeof data === 'string') {
|
||||
try {
|
||||
parsedData = JSON.parse(data)
|
||||
|
|
|
@ -1 +1 @@
|
|||
Subproject commit 49eae83f8194ab9585939b93119f82f7c0da16bb
|
||||
Subproject commit 7f4163736cc0bfed3259f39f7bc0063ca191da21
|
Loading…
Add table
Add a link
Reference in a new issue