txt2html

Converts plaintext to HTML
git clone git://src.gearsix.net/txt2html
Log | Files | Refs | Atom | README

commit 55a3a9250ecd073d919b0180f21397b240158fa8
Author: GeaRSiX <gearsix@tuta.io>
Date:   Sun, 20 Jun 2021 17:12:07 +0100

implemented skeleton for AST & paragraphs

Diffstat:
AREADME | 13+++++++++++++
ATODO | 10++++++++++
Atest.txt | 97+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Atxt2html.c | 144+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 264 insertions(+), 0 deletions(-)

diff --git a/README b/README @@ -0,0 +1,13 @@ + +txt2html +======== + +txt2html is a tool for converting text files to HTML. + +The goal is to be simple: not to create a set of syntax rules for yet another markup language, but to take conventions commonly found in .txt files and convert them to HTML. + +You could probably use cmark (https://github.com/commonmark/cmark) or another markdown parser to achieve the same goal, but this was written as an educational project in writing an AST in C. + +authors +------- +- gearsix diff --git a/TODO b/TODO @@ -0,0 +1,10 @@ +- rename writeP +- implement ol +- implement ul +- implement li +- implement h1 +- implement h2 +- test ast against just writing straight to buffer +- avoid recursion (unless needed) - slow +- test against test.txt, improve test.txt for catching edge-cases +- maybe rename to NAML (not another markup language) diff --git a/test.txt b/test.txt @@ -0,0 +1,97 @@ +generated using https://jaspervdj.be/lorem-markdownum/ + +Nec rumpunt nacta +================= + +Natus advena venit ipse cornua flumina +-------------------------------------- + +Lorem markdownum concurrere; tot eodem nulli: tamen hunc supplex das. Sustineat +parens, mox unda nec vix instrumenta, caeruleus arcisque. Hac mundi fessos date +fecit Dicta madefacta oscula condiderat Gaudia Latous terra si iuris: medio: +per. Et dea vulnus ostendit pronus, docuique vel Luna consumite tibi nobis. Huic +tale et si altae caeleste: cur flectentem Lethaei consilioque mediis. + +1. Sacris unda ad aquis celebrare movensque spectes +2. Suppressis laniaverat terror +3. Nec medendi neque insolida +4. Iam ut everti nactusque tamquam et cetera +5. Paternos murmurat dixit + +Artus fors ulnas tuebere. Est quod fecit curva latentem ostendit! Orsa quid dixi +sui, pendebat acres, vero Minervam iube artis et in. Sibi aevum erit est misit +lina qui. + +Noverit bis Saturno ora variarum notent concipit +------------------------------------------------ + +Reddat error; es videt. 
Lacrimis matris, sua isse summa statione dispositam +negat viridique et faces flammas, lacertis me. + +Horae protinus femineam genuit certe docebo silentum cuspide capitis, rogabo +rarissima vitis, resupinoque. Parantem mollia longis grates Pallante, me funesto +dixit incipiens petens. Sit novena, curru Cyanen tandem concretam; edidit +traiectus facta. + +* Desere erexit +* Amor vates Oenides succiso rescindere in amorem +* Anhelo ante genu crepitantia ponit tui socii +* Mater mundi silentia expellere et piget magnorum +* Corpora aethera armatur femina + +Ora tempora pabula +------------------ + +Poteramus dente? Triplex ultra nec quoque, vix, et amore. Per fratres intres +domus, regnare ignota nemorum dimittit quodsi curvamine Castrumque: fidem? Mihi +praecipuum Creten tenendae, iussum in viam, sensisse audebatis titulum in rigidi +suae, tum. + +- Passosque nostri tenetur +- Robustior nostras acta amantem albentibus mora +- Simul superabat aequore Rhamnusidis nidor nostro + +Fasque multae posset caelestibus suaque in et mihi nunc similis fatale fuerunt. +Longa vestigia luminis spargit, cum inpediique, vel damno bina quadrupes ope me! +Circumdata abesse carinae: inficere ille tam nimis nihil haec iacuit satis, quae +molles desuetudine dixit, noctes. Quamvis superest vices suus ferentes dixit, +ore post solent iunctis. Forma mihi sub negata visura. + +Exhalat voce +------------ + +Qui furto dare illud pars, Romana bene dolore medias deducit destinat et lucos +duo strigis, a. Ut illo oportuit, qui tu nunc lactens refert anxia dixi: Peleus +pro. Icta brevis. Meum atris plumis finitque at Elymumque moras per Ino +hominumque relictus legumque detulit quisquis optet nupta vultu: pro veni. +Laetitia liceat. + +Pennis unam praesens separat lacrimaeque prior sideribus et equos. Uno sola +illum, quam procos precibus obliquo aduncae ad excipit, redditur dignissime +corvus harundinibus limoso. Et ac cacumine vosne. 
Vidit his, te et secura licet: +adsueta superum nunc vixque neque sumere infans lacusque incubat! + +a. Ima ad sustinet iuratus Antium +b. Fama tuo aere orbis +c. Perterrita temone contingere Hippotadaeque inmanis decipit imitata +d. Tollit cervicibus aede magnanimi atque ostendens noxae +e. Crudeles poterunt credensque Ulixem videre + +Munera dextra cur agmen posuistis quoscumque opem +------------------------------------------------- + +Tenus monstri tamen, litem taurus pande auxiliaribus Creteque habet mollia +error; nec. Qui omnia qui, in hic genitor factus origine nuper sanguine +ferventibus. + +1.a. Abeunt quos templa +2.a. Strata si questaque sibi claro multicavo accipiter +3.a. Et dicta Piscibus taurus +4.a. Inque Romani +5.a. Desistunt et enim + +Sinistra fallere: valuere: hos patet ita, propriaque. Nemus factus nam videtur +sponsi. Alas non perque sagitta creverunt totumque turbine nec lilia populo +vesci hac frena sui membris ne postes haut notam digiti! Antiphates flexi +videnda colebat caelo lac pars laedar! Telo coniuge spectas est modo rapit +dolens Arcesius nobilis principiis video nec iterum caerula. 
// diff --git a/txt2html.c b/txt2html.c @@ -0,0 +1,144 @@
//
// txt2html.c - converts plaintext into a doubly linked list of HTML nodes
// (a skeleton AST that currently only knows paragraphs and line breaks).
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <ctype.h> // replace with utf8 support

// config
#define OPT_HB 0x01 // hard breaks: a single '\n' inside a paragraph emits <br/>

// node tags
#define OPEN 0x10
#define CLOSE 0x20
#define P 0x01
#define BR 0x02

// One element of the output list.
// type: 0 for the list head, P for paragraph text, or a tag built from
//       OPEN/CLOSE combined with P/BR.
// buf:  NULL, or a heap-allocated NUL-terminated string holding the markup or
//       (escaped) text to print for this node.
struct node {
	struct node *prev, *next;
	uint8_t type;
	char *buf;
};

void writeP(struct node *n, int c);
struct node *txt2html(const char *txt);
struct node *next(struct node *prev, uint8_t tag, struct node *n);

static void *xrealloc(void *ptr, size_t size);
static struct node *append(struct node *tail, uint8_t tag);
static void flushP(struct node *n, const char *pending, size_t *len);
static void free_nodes(struct node *head);

const uint8_t opts = OPT_HB;

// Define TXT2HTML_NO_MAIN to compile the converter without the demo entry
// point (e.g. to link it into a test harness).
#ifndef TXT2HTML_NO_MAIN
// Converts a built-in sample string and prints the resulting HTML to stdout.
int main(int argc, char **argv)
{
	(void)argc;
	(void)argv;

	const char *text = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\naaaaaaa\n\naaaaaa";

	struct node *head = txt2html(text);
	for (const struct node *n = head; n != NULL; n = n->next) {
		// Never use converted text as a format string: it may contain '%'.
		if (n->buf != NULL)
			fputs(n->buf, stdout);
	}
	puts("");

	free_nodes(head);
	return EXIT_SUCCESS;
}
#endif

// realloc() that terminates the program on allocation failure, so callers
// never have to handle NULL. xrealloc(NULL, size) behaves like malloc(size).
static void *xrealloc(void *ptr, size_t size)
{
	void *mem = realloc(ptr, size);
	if (mem == NULL) {
		fputs("txt2html: out of memory\n", stderr);
		exit(EXIT_FAILURE);
	}
	return mem;
}

// Allocates a fresh node and links it after `tail` with the given tag.
// Returns the new node (the new tail).
static struct node *append(struct node *tail, uint8_t tag)
{
	struct node *fresh = xrealloc(NULL, sizeof *fresh);
	return next(tail, tag, fresh); // next() initialises every field
}

// Releases every node from `head` to the end of the list, including each
// node's buf. Passing NULL is a no-op.
static void free_nodes(struct node *head)
{
	while (head != NULL) {
		struct node *following = head->next;
		free(head->buf);
		free(head);
		head = following;
	}
}

// Parses `txt` and returns the head of a newly allocated node list.
//
// The head is an empty sentinel (type 0, buf NULL). Printable characters and
// tabs start a paragraph (an OPEN+P node followed by a P text node); a blank
// line ("\n\n") or the end of input closes it (CLOSE+P). A single newline
// inside a paragraph becomes a break node plus a new P text node when OPT_HB
// is set in `opts`, otherwise it is written as a space.
//
// A NULL `txt` is treated as empty input. Each node is allocated separately;
// the caller owns the list and must free every node and its buf.
struct node *txt2html(const char *txt)
{
	struct node *head = xrealloc(NULL, sizeof *head);
	head->prev = NULL;
	head->next = NULL;
	head->type = 0;
	head->buf = NULL;

	if (txt == NULL)
		return head;

	struct node *n = head;
	const size_t len = strlen(txt);

	// Run one step past the last character so the end of input (EOF) gets a
	// chance to close an open paragraph.
	for (size_t i = 0; i <= len; ++i) {
		// Go through unsigned char so byte 0xFF is not mistaken for EOF and
		// the value is valid for <ctype.h>.
		const int c = (i < len) ? (unsigned char)txt[i] : EOF;

		switch (n->type) {
		case P:
			if (c == EOF) {
				n = append(n, CLOSE+P);
			} else if (c == '\n' && txt[i+1] == '\n') {
				++i; // consume the second newline of the blank line
				n = append(n, CLOSE+P);
			} else if (c == '\n') {
				if (opts & OPT_HB) {
					n = append(n, OPEN+BR+CLOSE);
					n = append(n, P);
				} else {
					writeP(n, ' ');
				}
			} else {
				writeP(n, c);
			}
			break;
		case 0:
		default:
			if (isprint(c) || c == '\t') {
				if (n->prev == NULL || n->type == CLOSE+P) {
					n = append(n, OPEN+P);
					n = append(n, P);
				}
				writeP(n, c);
			}
			break;
		}
	}

	return head;
}

// Links `n` after `prev` and initialises it for `tag`.
//
// Every field of `n` is overwritten, so `n` may be uninitialised memory.
// Tag nodes get a heap-allocated copy of their markup in buf; any other tag
// (e.g. P) starts with buf == NULL. When a closing or break tag follows a P
// text node, that node's pending text is flushed first.
// Returns `n`.
struct node *next(struct node *prev, uint8_t tag, struct node *n)
{
	const char *markup = NULL;

	switch (tag) {
	case OPEN+P:
		markup = "<p>";
		break;
	case CLOSE+P:
		markup = "</p>";
		break;
	case OPEN+BR+CLOSE:
		markup = "<br/>";
		break;
	default:
		break;
	}

	if ((tag == CLOSE+P || tag == OPEN+BR+CLOSE) && prev->type == P)
		writeP(prev, EOF);

	prev->next = n;
	n->prev = prev;
	n->next = NULL;
	n->type = tag;
	n->buf = NULL;

	if (markup != NULL) {
		const size_t size = strlen(markup) + 1;
		n->buf = xrealloc(NULL, size);
		memcpy(n->buf, markup, size);
	}
	return n;
}

// Appends the `*len` pending bytes to n->buf (growing it as needed), keeps
// n->buf NUL-terminated and resets `*len` to 0. n->buf must be NULL or a
// heap-allocated NUL-terminated string.
static void flushP(struct node *n, const char *pending, size_t *len)
{
	const size_t have = (n->buf != NULL) ? strlen(n->buf) : 0;

	n->buf = xrealloc(n->buf, have + *len + 1);
	memcpy(n->buf + have, pending, *len);
	n->buf[have + *len] = '\0';
	*len = 0;
}

// Appends character `c` to the text of paragraph node `n`.
//
// Text is staged in a static buffer and moved into n->buf when the buffer
// fills up or when `c` is EOF (an explicit flush). Because the staging buffer
// is shared, a node must be flushed with EOF before writing to another node;
// next() does this when it closes or breaks a paragraph.
//
// A tab is written as "&emsp;", and the HTML-significant characters '&', '<'
// and '>' are written as character references so the text cannot be parsed
// as markup.
void writeP(struct node *n, int c)
{
	static char pending[BUFSIZ];
	static size_t len = 0;

	if (c == EOF) {
		flushP(n, pending, &len);
		return;
	}

	const char *entity = NULL;
	switch (c) {
	case '\t':
		entity = "&emsp;";
		break;
	case '&':
		entity = "&amp;";
		break;
	case '<':
		entity = "&lt;";
		break;
	case '>':
		entity = "&gt;";
		break;
	default:
		break;
	}

	const unsigned char raw = (unsigned char)c;
	const void *src = (entity != NULL) ? (const void *)entity : (const void *)&raw;
	const size_t srclen = (entity != NULL) ? strlen(entity) : 1;

	// Make room first so a multi-byte entity can never overrun the buffer.
	if (len + srclen > sizeof pending)
		flushP(n, pending, &len);

	memcpy(pending + len, src, srclen);
	len += srclen;
}