clean content by removing special-character lines, add content word statistics
This commit is contained in:
parent bd538bcee8
commit 3985b4ba33
@@ -17,6 +17,7 @@ let headers = []
 let tasksObjectsForJsonExport = []
 let headersByKind = {}
 let writeJsonAfterParse = false;
+writeJsonAfterParse = true;
 
 /**************************************************************
  * fetch the source orgmode file to read its contents
@@ -42,10 +43,16 @@ fs.stat(sourceFilePath, function (err, stat) {
  * search elements
  *********************/
 let stateKeywordList = ['SOMEDAY', 'NEXT', 'TODO', 'CANCELLED', 'DONE', 'WAITING'];
-let dateKeywordList = ['CREATED', 'SCHEDULED', 'DEADLINE', 'CLOSED','Refiled'];
+let dateKeywordList = ['CREATED', 'SCHEDULED', 'DEADLINE', 'CLOSED', 'Refiled'];
 let sectionKeywordList = ['PROPERTIES', 'LOGBOOK', 'END'];
-let propertiesSection = {}
-let logBookSection = {}
+let propertiesSection = {} // TODO properties listing
+let logBookSection = {} // TODO logbook listing
 
+let statistics = {
+    tags: {},
+    words: {}
+}
+
 let headerKeywordSearch = '[' + stateKeywordList.join('|') + ']'
 /**
@@ -55,7 +62,7 @@ let headerKeywordSearch = '[' + stateKeywordList.join('|') + ']'
 let task = {
     header: "",
     level: "",
-    content: "",
+    corpus: "",
     state: "",
     tags: [],
     tagsInherited: [],
@@ -63,17 +70,39 @@ let task = {
     logbook: {},
     properties: {},
 }
-// init first task object as empty clone
-let currentTask = {...task};
 let isHeader = false;
 let isProperty = false;
 let isLogbook = false;
 let isFirst = true;
 
+// init first task object as empty clone
+let currentTask = {...task};
+
+/**
+ * add to tasks to export and refresh current task
+ */
+function addAndRefreshCurrentTask() {
+    tasksObjectsForJsonExport.push(currentTask)
+    currentTask = {...task};
+    currentTask.dates = {};
+};
+
+function makeWordsStatistics(sentence) {
+    sentence.split(' ')?.forEach(word => {
+        if (!statistics.words[word]) {
+            statistics.words[word] = 0
+        }
+        statistics.words[word]++
+    })
+}
+
 /**********************
  * loop to parse all
  *********************/
 fs.readFile(sourceFilePath, 'utf8', function (err, data) {
 
 
     if (err) {
         return console.log(err);
     }
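Note: the makeWordsStatistics helper added above builds a plain word-frequency map on statistics.words. Below is a minimal standalone sketch of that counting pattern, using a hypothetical sample sentence (not taken from the commit):

    // standalone sketch of the word-counting pattern used by makeWordsStatistics
    const statistics = { tags: {}, words: {} };

    function makeWordsStatistics(sentence) {
        sentence.split(' ').forEach(word => {
            if (!statistics.words[word]) {
                statistics.words[word] = 0;
            }
            statistics.words[word]++;
        });
    }

    makeWordsStatistics('TODO buy milk and buy bread');
    console.log(statistics.words);
    // roughly: { TODO: 1, buy: 2, milk: 1, and: 1, bread: 1 }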
@@ -90,10 +119,8 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
         if (line.match(/^\*+? /)) {
             // add last task to export list
             if (!isFirst) {
-                tasksObjectsForJsonExport.push(currentTask)
-                console.log('currentTask.dates', currentTask.dates)
-                currentTask = {...task};
+                addAndRefreshCurrentTask();
             } else {
                 isFirst = false;
             }
@@ -105,11 +132,9 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
             // create a new task
 
 
-            line = line.replace('*', '')
-            line = line.replace(stateKeywordList, [].fill('', 0, stateKeywordList.length))
-            headers.push(line)
-            currentTask.header = line;
+            headers.push(cleanHeader(line))
+            currentTask.header = cleanHeader(line);
+            makeWordsStatistics(cleanHeader(line));
             stateKeywordList.forEach(keyword => {
                 let keywordIsFound = lineHasKeyword(line, keyword)
 
@@ -123,21 +148,29 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
             let tagsFound = line.match(/\:(.*)\:/g)
             if (tagsFound) {
                 tagsFound = tagsFound[0];
-                console.log('tagsFound', tagsFound)
-                tagsFound = tagsFound.split(':').filter(item => item.length)
-                currentTask.tags = tagsFound;
+                let tagList = tagsFound.split(':');
+                tagList?.forEach(tag => {
+                    if (tag.length > 1) {
+
+                        if (!statistics.tags[tag]) {
+                            statistics.tags[tag] = 0
+                        }
+                        statistics.tags[tag]++
+
+                        currentTask.tags.push(tag)
+                    }
+                })
             }
 
 
-            // end of the searches in the Header line
+            // ------------- end of the searches in the Header line -------------
         } else {
             isHeader = false;
         }
         // examine the task body lines, or section body lines following the header.
 
         // classify the creation, closing, and logbook dates
         let dateFound = searchDate(line)
-        if(dateFound){
+        if (dateFound) {
 
             dateKeywordList.forEach(keyword => {
                 if (lineHasSubstring(line, keyword)) {
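Note: the tag handling added above splits an org tag group such as :work:urgent: on ':' and counts every non-trivial tag. A minimal standalone sketch of that step, with a hypothetical header line (not taken from the commit):

    // sketch: extracting and counting org-mode tags from a header line
    const statistics = { tags: {} };
    const currentTask = { tags: [] };

    const line = '* TODO call the bank :work:urgent:';
    const tagsFound = line.match(/\:(.*)\:/g);
    if (tagsFound) {
        const tagList = tagsFound[0].split(':'); // ['', 'work', 'urgent', '']
        tagList.forEach(tag => {
            if (tag.length > 1) {
                if (!statistics.tags[tag]) {
                    statistics.tags[tag] = 0;
                }
                statistics.tags[tag]++;
                currentTask.tags.push(tag);
            }
        });
    }
    console.log(statistics.tags, currentTask.tags);
    // roughly: { work: 1, urgent: 1 } [ 'work', 'urgent' ]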
@@ -149,31 +182,38 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
                     // console.log('keyword', keyword)
                 }
             })
-        }
+        } else {
 
+            if (line.indexOf(dateKeywordList) !== -1 && line.indexOf(stateKeywordList) !== -1 && line.indexOf(sectionKeywordList) !== -1) {
+
+                makeWordsStatistics(line)
                 // add the full body of the section after the header
                 if (line.length && !isHeader) {
 
-                    let cleanedLine = line.replace(/\s\s/g, ' ')
+                    let cleanedLine = line.replace(/\s\s/g, ' ');
                     cleanedLine = line.replace(/ {2,}/g, ' ')
-                    console.log('line', cleanedLine)
-                    currentTask.corpus += `
-` + cleanedLine;
-                }
+
+                    currentTask.corpus += `${cleanedLine}
+`
+                }
+            }
+        }
     })
     // add the last parsed task
-    tasksObjectsForJsonExport.push(currentTask)
+    addAndRefreshCurrentTask();
 
-    console.log('headers', headers)
     console.log(" parsing fini")
-    stateKeywordList.forEach(keyword => console.log('nombre de headers', keyword, headersByKind[keyword]?.length))
+    // stateKeywordList.forEach(keyword => console.log('nombre de headers', keyword, headersByKind[keyword]?.length))
 
 
     const jsonContent = {
         statistics: {
             lines_count: everyline.length,
             headers_count: headers.length,
+            statistics: Object.keys(statistics).sort(function (a, b) {
+                return statistics[a] - statistics[b]
+            })
         },
         meta_data: {
             author: '@tykayn@mastodon.Cipherbliss.com',
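Note: the statistics entry added to jsonContent above sorts the statistic keys by their counters and keeps only the key names, not the counts. A minimal sketch of that expression with hypothetical values:

    // sketch: sorting statistic keys by their counters, ascending (hypothetical values)
    const statistics = { TODO: 12, DONE: 30, CREATED: 7 };
    const sortedKeys = Object.keys(statistics).sort(function (a, b) {
        return statistics[a] - statistics[b];
    });
    console.log(sortedKeys); // [ 'CREATED', 'TODO', 'DONE' ]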
@@ -183,13 +223,14 @@ fs.readFile(sourceFilePath, 'utf8', function (err, data) {
         },
         tasks_list: tasksObjectsForJsonExport
     }
 
+    console.log('statistics', statistics)
     // console.log('tasksObjectsForJsonExport', jsonContent)
 
     if (writeJsonAfterParse) {
-        writeJsonFile('export_' + sourceFileName + '.json', JSON.stringify(jsonContent));
+        writeJsonFile('export_' + sourceFileName + '_parsed.json', JSON.stringify(jsonContent));
     }
 
-    return;
 })
 
 function lineHasKeyword(line, keyword = 'TODO') {
@@ -198,13 +239,22 @@ function lineHasKeyword(line, keyword = 'TODO') {
     if (isFound) {
         createNewHeaderKind(keyword)
         headersByKind[keyword].push(line);
+        if (!statistics[keyword]) {
+            statistics[keyword] = 0
+        }
+        statistics[keyword]++
     }
     return isFound;
 }
 
 function lineHasSubstring(line, keyword) {
+    let isFound = (line.indexOf(keyword) !== -1)
+    if (!statistics[keyword]) {
+        statistics[keyword] = 0
+    }
+    statistics[keyword]++
+
-    return (line.indexOf(keyword) !== -1)
+    return isFound
 }
 
 function createNewHeaderKind(keyword) {
@@ -226,14 +276,14 @@ function searchDate(line) {
     let simpleDayHour = line.match(/\d{4}\-\d{2}\-\d{2} \w{3}?\.? \d{2}\:\d{2}/)
     let simpleDayHourSec = line.match(/\d{4}\-\d{2}\-\d{2} \w{3}?\.? \d{2}\:\d{2}\:\d{2}/)
 
-    if(simpleDayHourSec){
+    if (simpleDayHourSec) {
         return simpleDayHourSec;
     }
 
-    if(simpleDayHour){
+    if (simpleDayHour) {
         return simpleDayHour;
     }
-    if(simpleDay){
+    if (simpleDay) {
         return simpleDay;
     }
 
@@ -249,6 +299,23 @@ function compareDatesAndKeepOldest(date1, date2) {
         date2 = moment(date2)
     }
 
+/**
+ * get the cleaned content of the header
+ * @param line
+ */
+function cleanHeader(line) {
+
+    line = '' + line;
+    stateKeywordList.forEach(keyword => {
+        line = line.replace(keyword, '')
+    })
+    line = line.replace(/\** /, '');
+    line = line.replace(/\[.*\]/g, '');
+    line = line.replace(/\:.*\:/g, '');
+    line = line.replace(' ', '');
+    return line.trim();
+}
+
 function writeJsonFile(fileName, fileContent) {
     console.log('write file ', fileName);
 
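Note: cleanHeader, added above, strips state keywords, leading stars, bracketed parts, and tag groups from a header line. A minimal standalone sketch of the same steps on a hypothetical org header (not taken from the commit):

    // sketch: the cleaning steps applied by cleanHeader, on a sample header line
    const stateKeywordList = ['SOMEDAY', 'NEXT', 'TODO', 'CANCELLED', 'DONE', 'WAITING'];

    function cleanHeader(line) {
        line = '' + line;
        stateKeywordList.forEach(keyword => {
            line = line.replace(keyword, ''); // drop the state keyword
        });
        line = line.replace(/\** /, '');      // drop the leading stars
        line = line.replace(/\[.*\]/g, '');   // drop bracketed parts such as [#A]
        line = line.replace(/\:.*\:/g, '');   // drop the :tag:group:
        line = line.replace(' ', '');         // drop one remaining space
        return line.trim();
    }

    console.log(cleanHeader('** TODO [#A] buy milk :errand:home:'));
    // expected: 'buy milk'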