clean content by removing special-character lines, add content word statistics
This commit is contained in:
parent bd538bcee8
commit 3985b4ba33
@@ -17,6 +17,7 @@ let headers = []
 let tasksObjectsForJsonExport = []
 let headersByKind = {}
 let writeJsonAfterParse = false;
+writeJsonAfterParse = true;
 
 /**************************************************************
  * fetch the source orgmode file to read its contents
@@ -42,10 +43,16 @@ fs.stat(sourceFilePath, function (err, stat) {
  * search elements
  *********************/
 let stateKeywordList = ['SOMEDAY', 'NEXT', 'TODO', 'CANCELLED', 'DONE', 'WAITING'];
-let dateKeywordList = ['CREATED', 'SCHEDULED', 'DEADLINE', 'CLOSED','Refiled'];
+let dateKeywordList = ['CREATED', 'SCHEDULED', 'DEADLINE', 'CLOSED', 'Refiled'];
 let sectionKeywordList = ['PROPERTIES', 'LOGBOOK', 'END'];
-let propertiesSection = {}
-let logBookSection = {}
+let propertiesSection = {} // TODO properties listing
+let logBookSection = {} // TODO logbook listing
 
+let statistics = {
+    tags: {},
+    words: {}
+}
+
 let headerKeywordSearch = '[' + stateKeywordList.join('|') + ']'
 /**
@@ -55,7 +62,7 @@ let headerKeywordSearch = '[' + stateKeywordList.join('|') + ']'
 let task = {
     header: "",
     level: "",
-    content: "",
+    corpus: "",
     state: "",
     tags: [],
     tagsInherited: [],
@@ -63,17 +70,39 @@ let task = {
     logbook: {},
     properties: {},
 }
-// init first task object as empty clone
-let currentTask = {...task};
 let isHeader = false;
 let isProperty = false;
 let isLogbook = false;
 let isFirst = true;
 
+// init first task object as empty clone
+let currentTask = {...task};
+
+/**
+ * add to tasks to export and refresh current task
+ */
+function addAndRefreshCurrentTask() {
+    tasksObjectsForJsonExport.push(currentTask)
+    currentTask = {...task};
+    currentTask.dates = {};
+};
+
+function makeWordsStatistics(sentence) {
+    sentence.split(' ')?.forEach(word => {
+        if (!statistics.words[word]) {
+            statistics.words[word] = 0
+        }
+        statistics.words[word]++
+    })
+}
+
 /**********************
  * loop to parse all
  *********************/
 fs.readFile(sourceFilePath, 'utf8', function (err, data) {
 
 
     if (err) {
         return console.log(err);
     }
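Note: the makeWordsStatistics helper added above builds a plain word-frequency map on statistics.words. Below is a minimal standalone sketch of that counting pattern, using a hypothetical sample sentence (not taken from the commit):

    // standalone sketch of the word-counting pattern used by makeWordsStatistics
    const statistics = { tags: {}, words: {} };

    function makeWordsStatistics(sentence) {
        sentence.split(' ').forEach(word => {
            if (!statistics.words[word]) {
                statistics.words[word] = 0;
            }
            statistics.words[word]++;
        });
    }

    makeWordsStatistics('TODO buy milk and buy bread');
    console.log(statistics.words);
    // roughly: { TODO: 1, buy: 2, milk: 1, and: 1, bread: 1 }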
@@ -90,10 +119,8 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
         if (line.match(/^\*+? /)) {
             // add last task to export list
             if (!isFirst) {
-                tasksObjectsForJsonExport.push(currentTask)
-                console.log('currentTask.dates', currentTask.dates)
-                currentTask = {...task};
+                addAndRefreshCurrentTask();
             } else {
                 isFirst = false;
             }
@@ -105,11 +132,9 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
             // create a new task
 
 
-            line = line.replace('*', '')
-            line = line.replace(stateKeywordList, [].fill('', 0, stateKeywordList.length))
-            headers.push(line)
-            currentTask.header = line;
+            headers.push(cleanHeader(line))
+            currentTask.header = cleanHeader(line);
+            makeWordsStatistics(cleanHeader(line));
             stateKeywordList.forEach(keyword => {
                 let keywordIsFound = lineHasKeyword(line, keyword)
 
@@ -123,21 +148,29 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
             let tagsFound = line.match(/\:(.*)\:/g)
             if (tagsFound) {
                 tagsFound = tagsFound[0];
-                console.log('tagsFound', tagsFound)
-                tagsFound = tagsFound.split(':').filter(item => item.length)
-                currentTask.tags = tagsFound;
+                let tagList = tagsFound.split(':');
+                tagList?.forEach(tag => {
+                    if (tag.length > 1) {
+
+                        if (!statistics.tags[tag]) {
+                            statistics.tags[tag] = 0
+                        }
+                        statistics.tags[tag]++
+
+                        currentTask.tags.push(tag)
+                    }
+                })
             }
 
 
-            // end of the searches in the Header line
+            // ------------- end of the searches in the Header line -------------
         } else {
             isHeader = false;
         }
         // examine the task body lines, or section body lines following the header.
 
         // classify the creation, closing, and logbook dates
         let dateFound = searchDate(line)
-        if(dateFound){
+        if (dateFound) {
 
             dateKeywordList.forEach(keyword => {
                 if (lineHasSubstring(line, keyword)) {
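Note: the tag handling added above splits an org tag group such as :work:urgent: on ':' and counts every non-trivial tag. A minimal standalone sketch of that step, with a hypothetical header line (not taken from the commit):

    // sketch: extracting and counting org-mode tags from a header line
    const statistics = { tags: {} };
    const currentTask = { tags: [] };

    const line = '* TODO call the bank :work:urgent:';
    const tagsFound = line.match(/\:(.*)\:/g);
    if (tagsFound) {
        const tagList = tagsFound[0].split(':'); // ['', 'work', 'urgent', '']
        tagList.forEach(tag => {
            if (tag.length > 1) {
                if (!statistics.tags[tag]) {
                    statistics.tags[tag] = 0;
                }
                statistics.tags[tag]++;
                currentTask.tags.push(tag);
            }
        });
    }
    console.log(statistics.tags, currentTask.tags);
    // roughly: { work: 1, urgent: 1 } [ 'work', 'urgent' ]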
@@ -149,31 +182,38 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
                     // console.log('keyword', keyword)
                 }
             })
-        }
+        } else {
 
+            if (line.indexOf(dateKeywordList) !== -1 && line.indexOf(stateKeywordList) !== -1 && line.indexOf(sectionKeywordList) !== -1) {
+
+                makeWordsStatistics(line)
                 // add the full body of the section after the header
                 if (line.length && !isHeader) {
 
-                    let cleanedLine = line.replace(/\s\s/g, ' ')
+                    let cleanedLine = line.replace(/\s\s/g, ' ');
                     cleanedLine = line.replace(/ {2,}/g, ' ')
-                    console.log('line', cleanedLine)
-                    currentTask.corpus += `
-` + cleanedLine;
-                }
+
+                    currentTask.corpus += `${cleanedLine}
+`
+                }
+            }
+        }
     })
     // add the last parsed task
-    tasksObjectsForJsonExport.push(currentTask)
+    addAndRefreshCurrentTask();
 
-    console.log('headers', headers)
     console.log(" parsing fini")
-    stateKeywordList.forEach(keyword => console.log('nombre de headers', keyword, headersByKind[keyword]?.length))
+    // stateKeywordList.forEach(keyword => console.log('nombre de headers', keyword, headersByKind[keyword]?.length))
 
 
     const jsonContent = {
         statistics: {
             lines_count: everyline.length,
             headers_count: headers.length,
+            statistics: Object.keys(statistics).sort(function (a, b) {
+                return statistics[a] - statistics[b]
+            })
         },
         meta_data: {
             author: '@tykayn@mastodon.Cipherbliss.com',
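Note: the statistics entry added to jsonContent above sorts the statistic keys by their counters and keeps only the key names, not the counts. A minimal sketch of that expression with hypothetical values:

    // sketch: sorting statistic keys by their counters, ascending (hypothetical values)
    const statistics = { TODO: 12, DONE: 30, CREATED: 7 };
    const sortedKeys = Object.keys(statistics).sort(function (a, b) {
        return statistics[a] - statistics[b];
    });
    console.log(sortedKeys); // [ 'CREATED', 'TODO', 'DONE' ]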
@@ -183,13 +223,14 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
         },
         tasks_list: tasksObjectsForJsonExport
     }
 
+    console.log('statistics', statistics)
     // console.log('tasksObjectsForJsonExport', jsonContent)
 
     if (writeJsonAfterParse) {
-        writeJsonFile('export_' + sourceFileName + '.json', JSON.stringify(jsonContent));
+        writeJsonFile('export_' + sourceFileName + '_parsed.json', JSON.stringify(jsonContent));
     }
 
-    return;
 })
 
 function lineHasKeyword(line, keyword = 'TODO') {
@@ -198,13 +239,22 @@ function lineHasKeyword(line, keyword = 'TODO') {
     if (isFound) {
         createNewHeaderKind(keyword)
         headersByKind[keyword].push(line);
+        if (!statistics[keyword]) {
+            statistics[keyword] = 0
+        }
+        statistics[keyword]++
     }
     return isFound;
 }
 
 function lineHasSubstring(line, keyword) {
+    let isFound = (line.indexOf(keyword) !== -1)
+    if (!statistics[keyword]) {
+        statistics[keyword] = 0
+    }
+    statistics[keyword]++
+
-    return (line.indexOf(keyword) !== -1)
+    return isFound
 }
 
 function createNewHeaderKind(keyword) {
@@ -226,14 +276,14 @@ function searchDate(line) {
     let simpleDayHour = line.match(/\d{4}\-\d{2}\-\d{2} \w{3}?\.? \d{2}\:\d{2}/)
     let simpleDayHourSec = line.match(/\d{4}\-\d{2}\-\d{2} \w{3}?\.? \d{2}\:\d{2}\:\d{2}/)
 
-    if(simpleDayHourSec){
+    if (simpleDayHourSec) {
         return simpleDayHourSec;
     }
 
-    if(simpleDayHour){
+    if (simpleDayHour) {
         return simpleDayHour;
     }
-    if(simpleDay){
+    if (simpleDay) {
         return simpleDay;
     }
 
@@ -249,6 +299,23 @@ function compareDatesAndKeepOldest(date1, date2) {
         date2 = moment(date2)
     }
 
+/**
+ * get the cleaned content of the header
+ * @param line
+ */
+function cleanHeader(line) {
+
+    line = '' + line;
+    stateKeywordList.forEach(keyword => {
+        line = line.replace(keyword, '')
+    })
+    line = line.replace(/\** /, '');
+    line = line.replace(/\[.*\]/g, '');
+    line = line.replace(/\:.*\:/g, '');
+    line = line.replace(' ', '');
+    return line.trim();
+}
+
 function writeJsonFile(fileName, fileContent) {
     console.log('write file ', fileName);
 
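Note: cleanHeader, added above, strips state keywords, leading stars, bracketed parts, and tag groups from a header line. A minimal standalone sketch of the same steps on a hypothetical org header (not taken from the commit):

    // sketch: the cleaning steps applied by cleanHeader, on a sample header line
    const stateKeywordList = ['SOMEDAY', 'NEXT', 'TODO', 'CANCELLED', 'DONE', 'WAITING'];

    function cleanHeader(line) {
        line = '' + line;
        stateKeywordList.forEach(keyword => {
            line = line.replace(keyword, ''); // drop the state keyword
        });
        line = line.replace(/\** /, '');      // drop the leading stars
        line = line.replace(/\[.*\]/g, '');   // drop bracketed parts such as [#A]
        line = line.replace(/\:.*\:/g, '');   // drop the :tag:group:
        line = line.replace(' ', '');         // drop one remaining space
        return line.trim();
    }

    console.log(cleanHeader('** TODO [#A] buy milk :errand:home:'));
    // expected: 'buy milk'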