Rebol [ Encoding: "ASCII" Title: "make-doc-pro" Version: 1.3.0 Date: 28-Jan-2011/18:49:36+1:00 File: %make-doc-pro.r Author: ["Robert M. Muench" "Ladislav Mecir" "Henrik Mikael Kristensen"] Email: robert.muench@robertmuench.de Home: http://www.robertmuench.de/projects/mdp/ Copyright: { Robert M. Muench This parser can be freely used for non-commercial purposes. For commercial use, you have to contact the copyright owner. } Purpose: { Parses the make-doc-pro markup language into a datastructure that can be transformed into other document formats (such as HTML) with good titles, table of contents, section headers, indented fixed-spaced examples, bullets and definitions. } Category: [file markup text util 4] Library: [ level: 'advanced platform: 'all type: [dialect tool] domain: [dialects files html markup parse text text-processing web xml] tested-under: [view 1.2.8 2.7.7 [W2K XP OSX]] support: "See Rebol header" license: "See Rebol header" ] Note: {Based on make-doc.r from Carl Sassenrath, Rebol Technologies Inc.} Type: 'link-app ] ;{ ;-- Library paste BEGIN split: func ["Splits value" v [series!] rest [series!] /last] [ rest: either last [find/last v: copy v rest][find v: copy v rest] if rest [clear rest] v ] ; Stack Datastructure Object stack!: make object! [ stack: make block! [] push: func['value][ either (type? value) == block! [insert/only stack value] [insert stack value] ] pop: has [value] [ either (length? stack) > 0 [ value: first stack remove stack return value ] [return none] ] top: does [ if not empty? [return first stack] ] empty?: does [ either (length? stack) == 0 [return true][return false] ] ontop?: func ['value][ either value == top [return true][return false] ] instack?: func ['value /local result][ either result: find stack value [return index? result][return none] ] reset: does [ clear stack ] size: does [ return length? stack ] debug: does [ foreach entry stack [probe entry] ] ] ;--- Additional Functions assert: func [test [block!] text [string!]][ if not reduce test [ print reduce ["Asssert:" text "failed!"] ] exit ] pif: either value? 'case [:case] [ func [ [throw] { polymorphic if, minimum checking, no default, compatible with: computed blocks Return Exit Break non-logic conditions } args [block!]] [ if not unset? first args: do/next args [ either first args [either block? first args: do/next second args [do first args] [first args] ] [pif second do/next second args] ] ] ] ;-- Library paste END ;} ;-- global data ; debug_mode: true debug_mode: false ; light_mode: true ; light_mode: false ;--- make-doc-pro parser mdp-parser: context [ mdp-stack: make stack! [] ; storage to hold the mdp datastructure that will be the result of the parsing inline-stack: make stack! [] ; storage to hold mdp inline markup block active-stack: mdp-stack ; reference to active stack skip-counter: 0 ; counter how many chars of the input stream have been skipped (should be 0) rule-names: make stack! [] ; used to store rule-names for debugging lastemitted: none ; stores the last emitted name lastcode: none ; last parsed code refs: [] ; list of bookmark references todos: [] ; list of todo strings ;--Parser variables code: rest: mark: lastemitted_tmp: bullets: newcell_split: newrow_split: bullet_emit: skiped: definestart: boldstart: inline_para: none ;--Flags debugparse: true ; if true the mdp-parser will print debug messages debugparse: false ; if true the mdp-parser will print debug messages flags: make stack! [] ; stack of flags that are used to control the parser emit?: true ; turns emitting on and off ;--MDP-Stack handling emit: func ['name value /local tmp] [ ; trim all obsolete spaces if string? value [ if (back tail value) == " " [ trim/tail value if value = "" [exit] append value " " ] ] if emit? [ active-stack/push :name active-stack/push :value ] lastemitted: name ] emit-section: func [num /local tmp] [ if ref [repend refs [to-word ref trim text]] tmp: to-word join "sect" num emit :tmp text ] ;--Helper functions init: does [ mdp-stack/reset inline-stack/reset active-stack/reset rule-names/reset flags/reset lastcode: lastemitted: none skip-counter: 0 ; reset parse rule as this rule is altered after parsing the header ; titlerule: either light_mode [copy ["~~~"]][copy [opt "~~~"]] titlerule: copy [opt "~~~"] ] inputstream: func [width [integer!]][print ["###" mold copy/part mark width]] ; just place holders, will be replaced if =options debug is used, see parser rule below for code debug: func ['rule-name][] debugo: func [value][] debugf: does [] do-file: func [str file /local path res do-path] [ if file/1 = "%" [remove file] pif [ exists? file [text: last split-path file do-path: first split-path file] exists? join mdp-path file [text: join mdp-path file do-path: mdp-path] exists? join do_root file [text: join do_root file do-path: do_root] true [print ["Missing file to do:" file] exit] ] path: what-dir change-dir do-path if error? set/any 'res try [do load text] [print reform ["Error doing" text disarm res]] change-dir path ; need the output of do file to be printed. for some reason, the paragraph does not work. if all [value? 'res not error? :res not function? :res] [insert str res] ; this is still wrong ] insert-file: func [str file /code /local text] [ if file/1 = "%" [remove file] ; try to read the include file pif [ exists? file [text: read file] exists? join mdp-path file [text: read join mdp-path file] true [print ["Missing include file:" file] exit] ] ; insert code? either code [ emit example text ] [ ; insert the text from the include file up the end specifier or to the end insert/part str text any [find text "^/###" tail text] ] ] make-todo: func [text] [ last append todos make object! [ id: 1 + length? todos name: text status: none ] ] inline-parsing: func [text] [ if none? text [exit] text: trim text lastemitted_tmp: lastemitted active-stack: inline-stack either debugparse [ print ["Inline-Parsing:" text] print ["Inline-Parsing correct:" parse/all text inlinemarkup] ] [parse/all text inlinemarkup] active-stack: mdp-stack lastemitted: lastemitted_tmp either debugparse [reverse inline-stack/stack print ["Inline-Stack:" mold inline-stack/stack]] [reverse inline-stack/stack] ] ; ;--make-doc-pro parsing rules ; pdebug: [here: (prin "pdebug:" probe copy/part here 35)] ;Parsing storage variables text: none ; stores parsed text sequences para: none ; stores paragraph parts ref: none ; stores last stated reference ;Charactersets space: charset " ^-" spaces: [any space] nochar: charset " ^-^/" chars: complement nochar ;Helper rules line: [copy text to newline] ; copy the text from the actual stream position up to | or 'newline' (not including these chars) into 'text. The | is need because of table handling paragraph: [copy para some [chars [to newline | to end]]] word: [some space copy text some chars] ; skip spaces and copy all characters until the next whitespace ref-string: [opt [opt [some space] {"} copy ref to {"} 1 skip] line] ref-word: [some space copy ref some chars] ; copy ref part example: [copy code some [indented | newline indented] (lastcode: copy code)] indented: [some space opt chars to newline] ; this rule is used to parse the first line of a document which is the title. The title can either ; be marked with ~~~ or nothing. A title starting with no markup is only allowed once in a document. ; This rule is changed to ["~~~"] after the title has been parsed by removing 'opt titlerule: [opt "~~~"] ;--- Main rules mdp: [ some [ ;--Debug point mark: ;--Title and End of document titlerule (debug title) line (debugo text emit title text if (first titlerule) == 'opt [remove titlerule]) | "###" to end ;--Section Headers ; | ["===" | "-1-"] line (emit-section 1) ; | ["---" | "-2-"] line (emit-section 2) ; | ["+++" | "-3-"] line (emit-section 3) ; | ["..." | "-4-"] line (emit-section 4) | ["===" | "-1-"] ref-string (emit-section 1) | ["---" | "-2-"] ref-string (emit-section 2) | ["+++" | "-3-"] ref-string (emit-section 3) | ["..." | "-4-"] ref-string (emit-section 4) ;--Special common notations: | (debug define) define ( debugo text inline-parsing text ; really a define or only the : character es first char in a line either none? defword [emit paragraph copy inline-stack/stack] [ ; if there are several defines in a row, join them all in one table if lastemitted == 'define [emit define-join none] emit define reduce [defword copy inline-stack/stack] ] inline-stack/reset ) | "#" (debug numberitem) numberitem ( debugo text ; parse inline markup chars inline-parsing text ; and emit the parsed stack emit number copy inline-stack/stack ; clear inline stack inline-stack/reset ) | (debug bulletitem) bulletitem ( debugo text ; remember numbered-bullets if lastemitted == 'number [flags/push number-bullets debugf] ; parse inline markup chars, this will handle tables as well, solution see below inline-parsing text ; it could be that we entered this rule because the first character was a * but didn't introduced ; a bullet sequence but a bold sequence, this is the case if the length of bullets is 0 either (length? bullets) == 0 [ ; clean up state flag stack if lastemitted == 'number [flags/pop] emit paragraph copy inline-stack/stack ] [ ; inline-stack could now contain a newcell or newrow command, which would be emitted as a bullet item ; resulting in a wrong output because the closing bullet markup would be emitted after the newcell/newrow ; markup. The following code handles this situation be splitting out the tablehandling code ; split stack newcell or newrow as this ends our bulletitem newcell_split: split inline-stack/stack [[newcell #[none]]] newrow_split: split inline-stack/stack [[newrow #[none]]] ; the shorter of both will be emitted as bullet either (length? newcell_split) < (length? newrow_split) [bullet_emit: newcell_split] [bullet_emit: newrow_split] ; handle number-bullets case, don't clean up flags stack here, see below either flags/top == 'number-bullets [emit bullet reduce [(length? bullets) - 1 bullet_emit]] [emit bullet reduce [length? bullets bullet_emit]] ; the rest (copy required because the stack will be reset later) will be emitted as paragraph rest: "" ; default to empty string to avoid emitting loop length? bullet_emit [rest: remove inline-stack/stack] unless empty? rest [emit paragraph copy rest] ] ; probably an issue here with finishing the tag properly ; clear inline stack (and hence reference of rest (above) as well) inline-stack/reset ; handle lastemitted state correct and correct flags stack if flags/top == 'number-bullets [ flags/pop lastemitted: 'number ; fake so that possible further number-bullets will be handled correct ] ) | ";" to newline ; comment ;--Translator options | "=include" [some space "code" word (insert-file/code none join include_root to-file text) | word here: (insert-file here join include_root to-file text)] | "=do" word here: (do-file here to-file text) | "=meta" word (emit meta text) | "=flag" line (doc-flags: unique append doc-flags to-block text) | "=unflag" line (foreach flag to-block text [remove find doc-flags flag]) | (debug file) "=file" word (debugo text emit file text) | "=toc" (debug TOC) to newline (debugo "" emit toc none) | "=todos" to newline (emit todo-list none) | "=outline" (debug TOC) to newline (debugo "outline" emit toc 'outline) | "=language" word | "=options" some space some [ "faq" | "no-style" ( no_style: true ) | "debug" ( either debug_mode = true [ debug_mode: false debugparse: false debug: func ['rule-name][] debugo: func [value][] debugf: does [] ] [ debug_mode: true debugparse: true ; debug inline parsing ; debug just pushes the rule-name onto the stack ; this function might be called many times more than debugo that pops a value from the stack ; therefore we first make a pop and then a push, the first pop will be on an empty stack but that's ok per definition: nothing will happen debug: func ['rule-name][rule-names/pop rule-names/push rule-name] ; debug-out prints the rule-name; this indicates that the rule was called debugo: func [value][print reduce ["-->" rule-names/pop "--" mold value]] ; debugf dumps the flags' stack debugf: does [print "Flags Stack" flags/debug print "-----"] ] ) ] to newline ;--Special output | "=" copy bars some "-" (emit bar length? bars) | "=image" image to newline | "=caption" line (emit caption trim text) | "=todo" line (emit todo make-todo trim text) | "=url" some space [{"} copy url to {"} 1 skip | copy url some chars] line (either text == none [emit url reduce [url form url]][emit url reduce [url trim text]]) | "=bookmark" some space line (emit bookmark reduce [trim text]) | "=ref" ref-word (emit reference to-word trim ref) | "=image-root" line (image_root: to-file dirize trim text) | "=do-root" line (do_root: clean-path to-file dirize trim text) | "=include-root" line (include_root: to-file dirize trim text) | "=view" ( ; we use first as the stack isn't reversed yet. So the newest emitted stuff comes first. replace first mdp-stack/stack 'example 'view ) ;--Special sections: | "\inline" line ( flags/push inline emit inline-in all [text trim text] ) | "/inline" to newline ( either any [flags/empty? flags/top <> 'inline] [print "Flags-Stack not correct! Expecting /inline"] [flags/pop emit inline-out none] ) | "\in" to newline ( flags/push inin emit indent-in none ) | "/in" to newline ( either flags/top <> 'inin [print "Flags-Stack not correct! Expecting /in"] [flags/pop emit indent-out none] ) | "\note" line ( flags/push innote emit note-in text ) | "/note" to newline ( either flags/top <> 'innote [print "Flags-Stack not correct! Expecting /note"] [flags/pop emit note-out none] ) | "\table" [some space "header" (emit table-in 'tableheader) | (emit table-in none)] to newline ( flags/push intable ; keep track of tablemode on stack table: tablehandling ; change table rule to handle table characters ) | "/table" ( either any [flags/empty? flags/top = 'intable] [flags/pop emit table-out none] [print "Flags-Stack not correct! Expecting /table"] if any [flags/empty? flags/top <> 'intable] [ table: notablehandling ; change table rule to emit normal table characters when no more tables exist ] ) | "\form" word copy dialect to "/form" (emit form-script to-file text emit form dialect) | "/form" | "\plain" copy content to "/plain" (emit plain content) | "/plain" | "\history" | "/history" | "\if-not" line ( either find doc-flags to-word trim text [ emit?: false flags/push ifnotfalse ][ either find [iffalse ifnotfalse] flags/top [ emit?: false flags/push ifnotfalse ; do not allow embedded flags to pass ][ flags/push ifnottrue ] ] ) | "/if-not" to newline ( either find [ifnottrue ifnotfalse] flags/top [ flags/pop unless find [iffalse ifnotfalse] flags/top [emit?: true] ][ print "Flags-Stack not correct! Expecting /if-not" ] ) | "\if" line ( either find doc-flags to-word trim text [ either find [iffalse ifnotfalse] flags/top [ emit?: false flags/push iffalse ; do not allow embedded flags to pass ][ flags/push iftrue ] ][ emit?: false flags/push iffalse ] ) | "/if" to newline ( either find [iftrue iffalse] flags/top [ flags/pop unless find [iffalse ifnotfalse] flags/top [emit?: true] ][ print "Flags-Stack not correct! Expecting /if" ] ) ;--Example Text | (debug example) example (debugo code ; remove starting newlines while [(first code) == newline] [remove code] ; only once emit a header, else example ; this works because the header will follow the title line and this rule ; is triggered while doing the first round through this parse rule block. The title line rule AND header rule ; is triggered. If no example is given, the first newline (see rule below) will add the flag too, to avoid ; that the first ever used example will always be emitted as a header pif [ flags/instack? header [emit example code] true [emit header code flags/push header] ] ) ;--Text | (debug paragraph) paragraph (debugo para ; parse inline markup chars inline-parsing para pif [ lastemitted == 'bullet [emit bullet-join reduce [length? bullets copy inline-stack/stack] lastemitted: 'bullet] lastemitted == 'number [emit number-join para lastemitted: 'number] lastemitted == 'paragraph [emit paragraph-join none emit paragraph copy inline-stack/stack] true [emit paragraph copy inline-stack/stack] ] ; clear inline stack inline-stack/reset ) ;--Newline and join handling | newline [some newline ; This is the section handling 'newline 'newline ; If nothing special is needed, we reset lastemitted to none, so the rest of the parser behaves ; in default mode (for example bullet emitting in rule 'TEXT will be reset to normal text output. ( ; remember that we parsed the first newline, which ends the title line and the next indented text ; normally parsed as example has been recognized as boiler-plate header and was emitted ; This test is true if we only have a header line without boiler plate text following it, otherwise the ; example rule would have pushed the 'header flag already if not flags/instack? header [flags/push header] ; if we reach this point do some clean-up work as 'newline 'newline is the termination sequence ; for bullet lists, numbered lists etc. lastemitted: lastemitted_tmp: none if flags/top == 'number-bullets [flags/pop] ) | ; This is the section handling 'newline ( pif [lastemitted == 'header [emit header-join none]] ) ] ; This rule will skip everything from the input stream that we couldn't handle yet with any other rule | skiped: skip ; (print ["SKIP:" mold copy/part skiped 1] skip-counter: skip-counter + 1) ] ( ; cleanup stack if find to-string mdp-stack/top "join" [ mdp-stack/pop ] ) ] ; Tricky rules: These rules have to handle all kind of special cases for the defineword because a defineword can contain ; the separator character '-' as well. The trick is to use a break-rule to exit the any rule part in defineword and reset the ; input stream after the any rule. Than the defineseparator will be parsed again splitting the text into the two pieces ; defword and line that we need. (Thanks to Gabriele Santilli for this trick). define: [definestart: ":" copy defword defineword defineseparator copy text definition] definechars: complement charset " ^/" defineseparator: [spaces "-" spaces | #"^/" (defword: none) :definestart] defineword: [ any ; this hangs when no chars are available [ ; consume as much chars as possible any definechars (break-rule: none) [ ; if we have a defineseperator break-out -> defineseperator will be consumed in rule 'define tmp: defineseparator (break-rule: [end skip]) | #" " ] ; execute break-rule that will exit this rule break-rule ] ; reposition input stream to 'defineseperator position so this will be parsed :tmp ] definitionchars: complement charset "^/" definitionseparator: ["^/:" | "^/^/"] definition: [any [ ; read as much chars as possible any definitionchars (break-rule: none) [ ; if there is a definitionseperator found exit rule tmp: definitionseparator (break-rule: [end skip]) | "^/" ] break-rule ] ; reposition to 'definitionseperator for furthe parsing :tmp ] numberitem: [line] ; 1: parse as many * characters as possible ; 2: parse thru the text ; 3: check if this bullet item has bold text as well, in this case remove the * pair that's ; for bold text from the level counting bulletitem: [ boldstart: copy bullets some "*" opt [some boldchars] opt [ "*" ["^/" | "^-" | "|" | " " | "," | ";" | "." | "!" | "?"] (remove bullets)] (boldstart: skip boldstart length? bullets) :boldstart line ] ;-- Inline markup character handling parachars: complement charset "|~_-*=" ; ^/" ; |=" markupdelimiters: [[" " | "." | "," | ";" | "|" | "||" | newline | none]] boldchars: complement charset "*|^/" underlinechars: complement charset "_^/" italicchars: complement charset "~^/" strikechars: complement charset "-^/" parapart: [copy inline_para some parachars] tablehandling: [ ; [!] - this fails if a table is ended inside a cell (debug newrow) "||" (debugo none emit newrow none lastemitted_tmp: none) ; This will handle empty cells at the begin of a line | (debug newcell) "|" (debugo none emit newcell none lastemitted_tmp: none) ] notablehandling: ["|" (emit parapart "|")] table: notablehandling ; mark: (mark: skip mark -1) :mark -> Move input cursor one char to front to check special chars inlineprolog: [mark: (mark: skip mark -1) :mark markupdelimiters] inlineepilog: [mark: (mark: skip mark -1) :mark chars] inlineend: [mark: (if (length? mark) == 0 [insert markupdelimiters 'opt]) markupdelimiters (if (length? mark) == 0 [remove markupdelimiters]) :mark] inlinemarkup: [ some [ (debug parapart) parapart (debugo inline_para emit parapart inline_para) ; Tricky rules: ; 1. we parse 'markupdelimiter AND inline markup character ; 2. Next we check for a char, a whitespace is not allowed as this would indicate that the markup char should be emitted ; 3. and reposition the input stream to get this char in the following copy sequence as well ; 4. we catch all characters that are not the inline markup character ; 5. we check for a char AND the closing inline markup character ; 6. we check if this closing inline markup character is followed by delimiter so that we can be sure it's not the inline character we should emit | (debug bold) inlineprolog "*" mark: chars :mark copy boldtext some boldchars [ ; the if part is needed as the string could directly end with an inlinemerkup character inlineepilog "*" inlineend (debugo none emit bold boldtext) | newline (debugo none emit parapart rejoin ["*" boldtext]) ] | (debug italic) inlineprolog "~" mark: chars :mark copy italictext some italicchars [ inlineepilog "~" inlineend (debugo none emit italic italictext) | newline (debugo none emit parapart rejoin ["~" italictext]) ] | (debug strike) inlineprolog "-" mark: chars :mark copy striketext some strikechars [ ; if the inlinemarkup char is the last in the line we have to make the check for the markupdelimiters optional ; and reposition the input-sequence pointer inlineepilog "-" inlineend (debugo none emit strike striketext) | newline (debugo none emit parapart rejoin ["-" striketext]) ] | (debug underline) inlineprolog "_" mark: chars :mark copy underlinetext some underlinechars [ inlineepilog "_" inlineend (debugo none emit underline underlinetext) | newline (debugo none emit parapart rejoin ["_" underlinetext]) ] ; --Special handling | (debug image) "=image" image | (debug url) "=url" some space [{"} copy url to {"} 1 skip | copy url some chars] copy text to "=" "=" ( either text == none [emit url reduce [url form url]] [emit url reduce [url trim text]] ) ; -- Single char handling | (debug star) "*" (debugo none emit parapart "*") | (debug snail) "~" (debugo none emit parapart "~") | (debug minus) "-" (debugo none emit parapart "-") | (debug under) "_" (debugo none emit parapart "_") | (debug equal) "=" (debugo none emit parapart "=") ;--Table handling | (debug table) table (debugo none) ; This rule will skip everything from the input stream that we couldn't handle yet with any other rule | skiped: skip (print ["Inline SKIP:" mold copy/part skiped 1] skip-counter: skip-counter + 1) ] ] ; check alignment alignement: [ some space [ "left" (emit align 'left) | "right" (emit align 'right) | "center" (emit align 'center) | "float" (emit paragraph-join none emit align 'float) ] ] ; handles images image: [opt alignement some space copy text some chars (emit image join image_root to-file text)] ] ; context mdp-parser html-emitter: context [ output: [] flags: make stack! [] alignment: none ; used to temporarly store an alignment hint name: none ; these two hold the current item (name/value) of the parsed mdp-stack value: none path: "" ; used in site_mode for related pages section sects: [0 0 0 0] ; this is the counter for our 4 level sections toc-title: "Contents" ; text to use for TOC img-num: 0 ; counter for generated images ;--Helper functions init: does [ clear output flags/reset alignment: name: value: none img-num: 0 clear-sects ; sects: [0 0 0 0] ] nsp: "^/ " ; nsp = newline-space html-codes: [ "&" "&" "<" "<" ">" ">" {"} """ ] escape-html: func [text][ if any [word? text none? text empty? text] [return text] foreach [from to] html-codes [replace/all/case text from to] return text ] ; detect the outdent by each newline for the shortest line outdent: func [text /local spacing spaces][ spacing: 100000 spaces: complement charset " " foreach line parse/all text "^/" [if line: find line spaces [spacing: min spacing index? line]] spacing: spacing - 1 txt: remove/part text spacing while [txt: find/tail txt newline] [remove/part txt spacing] text ] emit: func [data] [append output reduce data] ; Reset all section counter to 0 clear-sects: does [change/dup sects 0 4] ; Increase section counters, create section counter string and return this string sect-num?: func [num /local n sn] [ ; increase section counter at num place by 1 change at sects num n: sects/:num + 1 ; reset all section counters behind 'num to 0 change/dup at sects num + 1 0 4 - num ; initialize local variable sn: copy "" ; append num times the section counter to form a w.x.y.z number repeat n num [append sn join sects/:n "."] ; remove trailing point remove back tail sn ; return the created number copy sn ] ;--- Predefined HTML emitter objects html-copyright: [
; -- add your footer information stuff below --
; ---add your footer information stuff above --
"Document formatter copyright " "Robert M. Münch"". All Rights Reserved."
"XHTML 1.0 Transitional formatted with Make-Doc-Pro Version:" system/script/header/version " on " now/date " at " now/time
| any [escape-html value/1 " "] | ] ;emit the definition text emit; emit inline markup paragraph value/2 ; and close definition, if further definitions come along define-join will be called emit [ |