Clean content by removing special-character lines; add content word statistics

2023-03-06 00:01:12 +01:00 · 2023-03-06 00:01:12 +01:00 · 3985b4ba33
parent bd538bcee8
commit 3985b4ba33
1 changed files with 115 additions and 48 deletions
--- a/parse_orgmode_to_json.mjs
+++ b/parse_orgmode_to_json.mjs
@ -17,6 +17,7 @@ let headers = []
 let tasksObjectsForJsonExport = []
 let headersByKind = {}
 let writeJsonAfterParse = false;
+writeJsonAfterParse = true;

 /**************************************************************
 * fetch the source orgmode file to read its contents
@ -42,10 +43,16 @@ fs.stat(sourceFilePath, function (err, stat) {
 * search elements
 *********************/
 let stateKeywordList = ['SOMEDAY', 'NEXT', 'TODO', 'CANCELLED', 'DONE', 'WAITING'];
-let dateKeywordList = ['CREATED', 'SCHEDULED', 'DEADLINE', 'CLOSED','Refiled'];
+let dateKeywordList = ['CREATED', 'SCHEDULED', 'DEADLINE', 'CLOSED', 'Refiled'];
 let sectionKeywordList = ['PROPERTIES', 'LOGBOOK', 'END'];
-let propertiesSection = {}
-let logBookSection = {}
+
+let propertiesSection = {} // TODO properties listing
+let logBookSection = {} // TODO logbook listing
+
+let statistics = {
+    tags: {},
+    words: {}
+}

 let headerKeywordSearch = '[' + stateKeywordList.join('|') + ']'
 /**
@ -55,7 +62,7 @@ let headerKeywordSearch = '[' + stateKeywordList.join('|') + ']'
 let task = {
    header: "",
    level: "",
-    content: "",
+    corpus: "",
    state: "",
    tags: [],
    tagsInherited: [],
@ -63,17 +70,39 @@ let task = {
    logbook: {},
    properties: {},
 }
-// init first task object as empty clone
-let currentTask = {...task};
+
 let isHeader = false;
 let isProperty = false;
 let isLogbook = false;
 let isFirst = true;

+// init first task object as empty clone
+let currentTask = {...task};
+
+/**
+ * Push the task being parsed onto the export list, then start a new empty task.
+ *
+ * The `{...task}` spread is only a shallow copy: without fresh containers every
+ * task would share the template's `tags` array (and the other nested objects),
+ * so a tag pushed on one task would show up on all of them.
+ */
+function addAndRefreshCurrentTask() {
+    tasksObjectsForJsonExport.push(currentTask);
+    currentTask = {
+        ...task,
+        // fresh containers, never the ones owned by the template
+        tags: [],
+        tagsInherited: [],
+        dates: {},
+        logbook: {},
+        properties: {},
+    };
+}
+
+/**
+ * Count every word of a sentence into `statistics.words`.
+ *
+ * @param sentence text to tokenize; it is split on runs of whitespace
+ */
+function makeWordsStatistics(sentence) {
+    String(sentence ?? '').split(/\s+/).forEach(word => {
+        // leading, trailing or repeated spaces produce empty tokens: not words
+        if (!word) {
+            return;
+        }
+        // own-property check: inherited names such as "constructor" are truthy
+        // on a plain object and would otherwise be incremented into NaN
+        if (!Object.prototype.hasOwnProperty.call(statistics.words, word)) {
+            statistics.words[word] = 0;
+        }
+        statistics.words[word]++;
+    });
+}
+
 /**********************
 * loop to parse all
 *********************/
 fs.readFile(sourceFilePath, 'utf8', function (err, data) {
+
+
    if (err) {
        return console.log(err);
    }
@ -90,10 +119,8 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
        if (line.match(/^\*+? /)) {
            // add last task to export list
            if (!isFirst) {
-                tasksObjectsForJsonExport.push(currentTask)

-                console.log('currentTask.dates', currentTask.dates)
-                currentTask = {...task};
+                addAndRefreshCurrentTask();
            } else {
                isFirst = false;
            }
@ -105,11 +132,9 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
            // create a new task


-            line = line.replace('*', '')
-            line = line.replace(stateKeywordList, [].fill('', 0, stateKeywordList.length))
-
-            headers.push(line)
-            currentTask.header = line;
+            headers.push(cleanHeader(line))
+            currentTask.header = cleanHeader(line);
+            makeWordsStatistics(cleanHeader(line));
            stateKeywordList.forEach(keyword => {
                let keywordIsFound = lineHasKeyword(line, keyword)

@ -123,57 +148,72 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
            let tagsFound = line.match(/\:(.*)\:/g)
            if (tagsFound) {
                tagsFound = tagsFound[0];
-                console.log('tagsFound', tagsFound)
-                tagsFound = tagsFound.split(':').filter(item => item.length)
-                currentTask.tags = tagsFound;
+                let tagList = tagsFound.split(':');
+                tagList?.forEach(tag => {
+                    if (tag.length > 1) {
+
+                        if (!statistics.tags[tag]) {
+                            statistics.tags[tag] = 0
+                        }
+                        statistics.tags[tag]++
+
+                        currentTask.tags.push(tag)
+                    }
+                })
            }


-            // fin des recherches dans la ligne de Header
+            // ------------- fin des recherches dans la ligne de Header -------------
        } else {
            isHeader = false;
        }
        // examen des lignes de corps de tâche, ou de corps de section suite au header.
-
        // classer les dates de création, cloture, et de logbook
        let dateFound = searchDate(line)
-        if(dateFound){
+        if (dateFound) {

-        dateKeywordList.forEach(keyword => {
-            if (lineHasSubstring(line, keyword)) {
-                if (!currentTask.dates[keyword]) {
-                    currentTask.dates[keyword] = '';
+            dateKeywordList.forEach(keyword => {
+                if (lineHasSubstring(line, keyword)) {
+                    if (!currentTask.dates[keyword]) {
+                        currentTask.dates[keyword] = '';
+                    }
+                    currentTask.dates[keyword] = new Date(dateFound[0]);
+                } else {
+                    // console.log('keyword', keyword)
+                }
+            })
+        } else {
+
+            if (line.indexOf(dateKeywordList) !== -1 && line.indexOf(stateKeywordList) !== -1 && line.indexOf(sectionKeywordList) !== -1) {
+
+                makeWordsStatistics(line)
+                // ajouter le corps complet de la section après le header
+                if (line.length && !isHeader) {
+
+                    let cleanedLine = line.replace(/\s\s/g, ' ');
+                    cleanedLine = line.replace(/ {2,}/g, ' ')
+
+                    currentTask.corpus += `${cleanedLine}
+`
                }
-                currentTask.dates[keyword] = new Date(dateFound[0]);
-            } else {
-                // console.log('keyword', keyword)
            }
-        })
        }
-
-        // ajouter le corps complet de la section après le header
-        if (line.length && !isHeader) {
-
-            let cleanedLine = line.replace(/\s\s/g, ' ')
-            cleanedLine = line.replace(/ {2,}/g, ' ')
-            console.log('line', cleanedLine)
-            currentTask.corpus += `
- ` + cleanedLine;
-        }
-
    })
    // ajouter la dernière tâche parsée
-    tasksObjectsForJsonExport.push(currentTask)
+    addAndRefreshCurrentTask();

-    console.log('headers', headers)
    console.log(" parsing fini")
-    stateKeywordList.forEach(keyword => console.log('nombre de headers', keyword, headersByKind[keyword]?.length))
+    // stateKeywordList.forEach(keyword => console.log('nombre de headers', keyword, headersByKind[keyword]?.length))


    const jsonContent = {
        statistics: {
            lines_count: everyline.length,
            headers_count: headers.length,
+            statistics: Object.keys(statistics).sort(function (a, b) {
+                return statistics[a] - statistics[b]
+            })
+
        },
        meta_data: {
            author: '@tykayn@mastodon.Cipherbliss.com',
@ -183,13 +223,14 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
        },
        tasks_list: tasksObjectsForJsonExport
    }
+
+    console.log('statistics', statistics)
    // console.log('tasksObjectsForJsonExport', jsonContent)

    if (writeJsonAfterParse) {
-        writeJsonFile('export_' + sourceFileName + '.json', JSON.stringify(jsonContent));
+        writeJsonFile('export_' + sourceFileName + '_parsed.json', JSON.stringify(jsonContent));
    }

-    return;
 })

 function lineHasKeyword(line, keyword = 'TODO') {
@ -198,13 +239,22 @@ function lineHasKeyword(line, keyword = 'TODO') {
    if (isFound) {
        createNewHeaderKind(keyword)
        headersByKind[keyword].push(line);
+        if (!statistics[keyword]) {
+            statistics[keyword] = 0
+        }
+        statistics[keyword]++
    }
    return isFound;
 }

 function lineHasSubstring(line, keyword) {
+    // plain substring test: returns true when `keyword` occurs anywhere in `line`
+    let isFound = (line.indexOf(keyword) !== -1)
+    // count a keyword only when the line really contains it, the same way
+    // lineHasKeyword does; counting on every call only measured how many
+    // times the test was run
+    if (isFound) {
+        if (!statistics[keyword]) {
+            statistics[keyword] = 0
+        }
+        statistics[keyword]++
+    }

-    return (line.indexOf(keyword) !== -1)
+    return isFound
 }

 function createNewHeaderKind(keyword) {
@ -226,14 +276,14 @@ function searchDate(line) {
    let simpleDayHour = line.match(/\d{4}\-\d{2}\-\d{2} \w{3}?\.? \d{2}\:\d{2}/)
    let simpleDayHourSec = line.match(/\d{4}\-\d{2}\-\d{2} \w{3}?\.? \d{2}\:\d{2}\:\d{2}/)

-    if(simpleDayHourSec){
+    if (simpleDayHourSec) {
        return simpleDayHourSec;
    }

-    if(simpleDayHour){
+    if (simpleDayHour) {
        return simpleDayHour;
    }
-    if(simpleDay){
+    if (simpleDay) {
        return simpleDay;
    }

@ -249,6 +299,23 @@ function compareDatesAndKeepOldest(date1, date2) {
    date2 = moment(date2)
 }

+/**
+ * get the cleaned content of the header: the title text only, without the
+ * leading stars, the state keyword, the bracketed parts and the trailing tags
+ * @param line raw headline, e.g. "** TODO write report :work:"
+ * @returns {string} the trimmed title, e.g. "write report"
+ */
+function cleanHeader(line) {
+    // drop the leading stars that mark the header level
+    let title = String(line).replace(/^\*+\s+/, '');
+
+    // a state keyword only counts as the first word of the headline;
+    // removing it anywhere would also eat ordinary words of the title
+    const state = stateKeywordList.find(keyword => title === keyword || title.startsWith(keyword + ' '));
+    if (state) {
+        title = title.slice(state.length);
+    }
+
+    title = title.replace(/\[.*\]/g, '');
+    // tags sit at the end of the headline (":a:b:"); anchoring the match there
+    // keeps colons used inside the title, such as "10:30", untouched
+    title = title.replace(/(?:^|\s):(?:[^\s:]+:)+\s*$/, '');
+    // collapse every run of spaces into one, instead of deleting the first
+    // pair of spaces and gluing the surrounding words together
+    title = title.replace(/ {2,}/g, ' ');
+    return title.trim();
+}
+
 function writeJsonFile(fileName, fileContent) {
    console.log('write file ', fileName);