%%           -*-Mode: prolog;-*-

:- multifile rx/2.
:- multifile macro/2.

:- discontiguous macro/2.
:- discontiguous rx/2.

macro(not(X),ext_sigma* - X).
macro($$(X),[ext_sigma*,X,ext_sigma*]).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Leftmost/Longest Contexted Replacement %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

macro(replace(T0,Left0,Right), pragma([T-T0,Left-Left0],cleanup(
      non_markers
          o
       m(r(Right))
          o
      f(domain(T))
          o
 left_to_right(domain(T))
          o
 longest_match(domain(T))
          o
    aux_replace(T)
          o
       l1(Left)
          o
       l2(Left)
          o
   inverse(non_markers) ))
     ).

macro(replace(T0), pragma([T-T0], cleanup(
      non_markers
          o
       m(r([]))
          o
      f(domain(T))
          o
 left_to_right(domain(T))
          o
 longest_match(domain(T))
          o
    aux_replace(T)
          o
        l1([])
          o
        l2([])
          o
   inverse(non_markers)))
     ).

%%%%%%%%%%%%%%%%%%%%%%%
%%% Special Symbols %%%
%%%%%%%%%%%%%%%%%%%%%%%

macro(bool,{0,1}).

macro(Marker,[Mrk,1]) :-
    marker_symbol(Marker,Mrk).

marker_symbol(lbrack1,'<1').
marker_symbol(lbrack2,'<2').
marker_symbol(rbrack2,'2>').
marker_symbol(rbrack1,'1>').

macro(lbrack,{lbrack1,lbrack2}).
macro(rbrack,{rbrack1,rbrack2}).
macro(brack1,{lbrack1,rbrack1}).
macro(brack2,{lbrack2,rbrack2}).

macro(brack, {rbrack, lbrack }).

macro(ext_sigma,[?,bool]).
macro(sigma,ext_sigma - brack).

macro(non_markers, [identity(?),[] x 0]*).
macro(non_markers(Expr), 
      range(identity(Expr) o [identity(?),[] x 0]*)).


%%%%%%%%%%%%%
%%% Utils %%%
%%%%%%%%%%%%%

macro(intro_(E2),{ext_sigma,[] x E2}*).
macro(xintro_(E2),{[],[ext_sigma,intro_(E2)]}).
macro(intro_x(E2),{[],[intro_(E2),ext_sigma]}).
macro(xintro_x(E2),{[],[ext_sigma],[ext_sigma,
					    intro_(E2),
					    ext_sigma]}).

macro(ignore_(E1,E2),  range(E1 o intro_(E2))).
macro(xignore_(E1,E2), range(E1 o xintro_(E2))).
macro(ignore_x(E1,E2), range(E1 o intro_x(E2))).
macro(xignore_x(E1,E2),range(E1 o xintro_x(E2))).

%%%%%%%%%%%%%%%%%%%%%%%
%%% Main Procedures %%%
%%%%%%%%%%%%%%%%%%%%%%%

macro(r(R),
   if_right_then_insert_m(rbrack2,non_markers(R))).

%% This diverges from Mohri & Sproat. Here we just insert just lbrack2
%% and not {lbrack1,lbrack2}.
%macro(f(F),
%   if_right_then_insert_m(lbrack2,
%                          [xignore_(non_markers(F),rbrack2),rbrack2])).


macro(intro(S0),pragma([S-S0], {ext_sigma-S,[] x S}*)).

macro(f(F), intro(lbrack2)
               o
       l_iff_r(lbrack2,[[xignore_x(non_markers(F),
					       brack2),
			 lbrack2^]-lbrack2, % Modified from
             rbrack2])).                    % published version

macro(if_p_then_s(L1,L2), 
      not([L1,not(L2)])).
macro(if_s_then_p(L1,L2), 
      not([not(L1),L2])).
macro(p_iff_s(V1,V2), pragma([L1-V1,L2-V2],
      if_p_then_s(L1,L2) 
            & 
      if_s_then_p(L1,L2))).

macro(l_iff_r(L,R), 
   p_iff_s([ext_sigma*,L],[R,ext_sigma*])).


macro(left_to_right(Phi), 
      [[ext_sigma*,
        [lbrack2 x lbrack1,
          (  ignore_(non_markers(Phi),brack2)   %% was ignore_x
                     o 
           [[{sigma,rbrack2}*,
	     lbrack2 x []]*,
	    {sigma,rbrack2}*]
         ),
         rbrack2 x rbrack1]
        ]*,
        ext_sigma*]
     ).

macro(longest_match(Phi0), pragma([Phi-Phi0], 
      not($$([lbrack1,
           (ignore_x(non_markers(Phi),brack)
                        &
           [ignore_(non_markers(Phi),brack),rbrack1,? *]),
           rbrack
          ]))
              o
   [[(ext_sigma - rbrack2)*,
     rbrack2 x []]*,
    (	ext_sigma - rbrack2)*]
     )).

macro(aux_replace(T), 
      {{sigma,lbrack2}, 
       [lbrack1,
        ( inverse(non_markers)
                 o
                 T
                 o
             non_markers
        ),
	rbrack1 x []
       ]
      }*).

macro(l1(L), 
      ignore_(if_m_then_left(lbrack1,non_markers(L)),
		  lbrack2)
                            o
      [[{sigma,lbrack2}*,lbrack1 x []]*,{sigma,lbrack2}*]
     ).

macro(l2(L),
   if_m_then_not_left_delete(lbrack2,non_markers(L))).

%%%%%%%%%%%%%
%%% Tests %%%
%%%%%%%%%%%%%

%% Use this test with the string abababa
macro(test1,replace(([a,b] x x),[a,b],a)).

%% Use this test with the string <1 <2 <1 <2 <1 <2 <1 
macro(test2,replace((['<1','<2'] x '1>'),['<1','<2'],'<1')).

%% Use test string: abbaa
macro(test3,replace(({[a,b],b,[b,a],[a,b],a} x x),{[a,b],b},a)).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% User Marker Functions %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% The basic maker functions as defined by Mohri & Sproat are too
%% low level to be easily used directly. The following high-level
%% macros should simplify use in four ways:
%% 
%% 1. The macros provide mnemonic names.
%%
%% 2. The macros are designed to handle markers which are a sequence
%%    of a some special symbol followed by a 1. This allows special
%%    marker symbols to be distinguished from ordinary symbols, which
%%    are followed by 0. So any special marker symbol followed by a 0
%%    will also be interpreted a just another ordinary symbol.
%%
%% 3. The macros encapsulate the need for the tricky use of reverse
%%    embedded inside of reverse in the case of markers sensitive to
%%    right context.
%%
%% 4. For marker2 and marker3, the option is provided of either just
%%    checking the markers or checking and simultaneously deleting the
%%    markers.

macro(if_left_then_insert_m(Marker0,Left),
   marker1( [{[? - Marker,bool],[Marker,0]}*,Left],
            seq(Marker,1))) :-
   marker_symbol(Marker0,Marker).

macro(if_right_then_insert_m(Marker0,Right),
   reverse(
      marker1(reverse([Right,{[? - Marker,bool],[Marker,0]}*]),
              seq(1,Marker)))) :-
   marker_symbol(Marker0,Marker).

macro(if_m_then_left(Marker0,Left),
   marker2( [{[? - Marker,bool],[Marker,0]}*,Left],
            seq(Marker,1),
            filter)) :-
   marker_symbol(Marker0,Marker).
       
macro(if_m_then_left_delete(Marker0,Left),
   marker2( [{[? - Marker,bool],[Marker,0]}*,Left],
            seq(Marker,1),
            delete)) :-
   marker_symbol(Marker0,Marker).

macro(if_m_then_right(Marker0,Right),
   reverse(
      marker2(reverse([Right,{[? - Marker,bool],[Marker,0]}*]),
              seq(1,Marker),
              filter))) :-
   marker_symbol(Marker0,Marker).

macro(if_m_then_right_delete(Marker0,Right),
   reverse(
      marker2(reverse([Right,{[? - Marker,bool],[Marker,0]}*]),
              seq(1,Marker),
              delete))) :-
   marker_symbol(Marker0,Marker).

macro(if_m_then_not_left(Marker0,Left),
   marker3( [{[? - Marker,bool],[Marker,0]}*,Left],
            seq(Marker,1),
            filter)) :-
   marker_symbol(Marker0,Marker).

macro(if_m_then_not_left_delete(Marker0,Left),
   marker3( [{[? - Marker,bool],[Marker,0]}*,Left],
            seq(Marker,1),
            delete)) :-
   marker_symbol(Marker0,Marker).

macro(if_m_then_not_right(Marker0,Right),
   reverse(
      marker2(reverse([Right,{[? - Marker,bool],[Marker,0]}*]),
              seq(1,Marker),
              filter))) :-
   marker_symbol(Marker0,Marker).

macro(if_m_then_not_right_delete(Marker0,Right),
   reverse(
      marker2(reverse([Right,{[? - Marker,bool],[Marker,0]}*]),
              seq(1,Marker),
              delete))) :-
   marker_symbol(Marker0,Marker).

%%%%%%%%%%%%%%
%%% type 1 %%%
%%%%%%%%%%%%%%

%% Type 1 has now been simplified so that it will insert just one kind
%% of marker in the appropriate position. This is no longer sufficient
%% to implement Mohri & Sproat's algorithm, since in their step f,
%% they the markers <1 and <2. 

rx(marker1(Expr,Change),Fa) :-
   fsa_regex:rx(identity(determinize(Expr)),Fa0),
   mark1(Change,Fa0,Fa).

mark1(seq(A0,B0),Fa1,Fa) :-
    fsa_data:fsa_symbols_decl(Fa1,Sigma),
    (	Sigma=r(Module)
    ;   Sigma=t(Module,_)
    ),
    Module:regex_notation_to_predicate(A0,A),
    Module:regex_notation_to_predicate(B0,B),    
    fsa_data:fsa_start_states(Fa1,Starts),
    fsa_data:fsa_transitions(Fa1,Trans0),
    fsa_data:fsa_final_states(Fa1,Finals),
    fsa_data:fsa_states_set(Fa1,AllStates),
    ordsets:ord_subtract(AllStates,Finals,NewFinals0),
    add_insertions(Finals,seq(A,B),NewFinals,NewFinals0,Trans,Trans1),
    replace_transitions_starting_final(Trans0,Trans1,Fa1),
    fsa_data:fsa_construct_rename_states(Sigma,Starts,NewFinals,Trans,[],Fa).

%% this is slow for large automata
%% in such cases it's worthwhile creating a thing for final_states
%% moreover it creates symbolic names first, which are renamed later
%% it would be better to create larger integers for state names here
%% immediately
replace_transitions_starting_final([],[],_).
replace_transitions_starting_final([trans(A0,B,C)|T0],[trans(A,B,C)|T],Fa):-
    (	fsa_data:fsa_final_state(Fa,A0)
    ->	A=q(A0)
    ;	A=A0
    ),
    replace_transitions_starting_final(T0,T,Fa).

add_insertions([],_,F,F) --> [].
add_insertions([F0|Fs],seq(A,B),[q(F0)|NewF0],NewF) -->
    [trans(F0,[]/A,q1(F0)),trans(q1(F0),[]/B,q(F0))],
    add_insertions(Fs,seq(A,B),NewF0,NewF).

%%%%%%%%%%%%%%
%%% type 2 %%%
%%%%%%%%%%%%%%

%% Type 2 and Type 3 are now provided with a flag to determine
%% whether or not the markeer should be deleted after checking.

rx(marker2(Expr,Change,Flag),Fa) :-
    (	Flag == filter
    ->	fsa_regex:rx(determinize(Expr),Fa0)
    ;	fsa_regex:rx(identity(determinize(Expr)),Fa0)
    ),
    mark2(Change,Flag,Fa0,Fa).

mark2(seq(A0,B0),Flag,Fa1,Fa) :-
    fsa_data:fsa_symbols_decl(Fa1,Sigma),
    (	Sigma=r(Module)
    ;   Sigma=t(Module,_)
    ),
    Module:regex_notation_to_predicate(A0,A),
    Module:regex_notation_to_predicate(B0,B),    
    fsa_data:fsa_copy_except(transitions,Fa1,Fa2,Trans0,Trans),
    fsa_data:fsa_copy_except(final_states,Fa2,Fa3,Finals,AllStates),
    fsa_data:fsa_copy_except(states,Fa3,Fa,Num0,Num),
    fsa_data:fsa_states_set(Fa1,AllStates),
    (	Flag == filter
    ->	add_2_filters(Finals,seq(A,B),Num0,Num,Trans1,Trans0)
    ;	add_2_deletions(Finals,seq(A,B),Num0,Num,Trans1,Trans0)
    ),
    sort(Trans1,Trans).

%%%%%%%%%%%%%%
%%% type 3 %%%
%%%%%%%%%%%%%%

rx(marker3(Expr,Change,Flag),Fa) :-
    (	Flag == filter
    ->	fsa_regex:rx(determinize(Expr),Fa0)
    ;	fsa_regex:rx(identity(determinize(Expr)),Fa0)
    ),
    mark3(Change,Flag,Fa0,Fa).

mark3(seq(A0,B0),Flag,Fa1,Fa) :-
    fsa_data:fsa_symbols_decl(Fa1,Sigma),
    (	Sigma=r(Module)
    ;   Sigma=t(Module,_)
    ),
    Module:regex_notation_to_predicate(A0,A),
    Module:regex_notation_to_predicate(B0,B),    
    fsa_data:fsa_copy_except(transitions,Fa1,Fa2,Trans0,Trans),
    fsa_data:fsa_copy_except(final_states,Fa2,Fa3,Finals,AllStates),
    fsa_data:fsa_copy_except(states,Fa3,Fa,Num0,Num),
    fsa_data:fsa_states_set(Fa1,AllStates),
    ordsets:ord_subtract(AllStates,Finals,NonFinals),
    (	Flag == filter
    ->	add_2_filters(NonFinals,seq(A,B),Num0,Num,Trans1,Trans0)
    ;	add_2_deletions(NonFinals,seq(A,B),Num0,Num,Trans1,Trans0)
    ),
    sort(Trans1,Trans).

add_2_filters([],_,Num,Num) --> [].
add_2_filters([Final|Fs],seq(A,B),Num0,Num) -->
    [trans(Final,A,Num0),trans(Num0,B,Final)],
    {user:is(Num1,+(Num0,1))},
    add_2_filters(Fs,seq(A,B),Num1,Num).

add_2_deletions([],_,Num,Num) --> [].
add_2_deletions([Final|Fs],seq(A,B),Num0,Num) -->
    [trans(Final,A/[],Num0),trans(Num0,B/[],Final)],
    {user:is(Num1,+(Num0,1))},
    add_2_deletions(Fs,seq(A,B),Num1,Num).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Longest-Match Concatenate %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

macro(lm_concat(Ts),mark_boundaries(Domains) o ConcatTs) :-
   extract_domains(Ts,Domains),
   concatT(Ts,ConcatTs).

extract_domains([],[]).
extract_domains([F|R0],[domain(F)|R]) :-
   extract_domains(R0,R).

concatT([],[]).
concatT([T|Ts], 
        [inverse(non_markers) o T,mark x []|RestConcat]) :-
   concatT(Ts,RestConcat).

macro(mark,[#,1]).

%% mark_boundaries
%% ---------------
%% This defines the markup transducer. As an example try:
%%
%% mark_boundaries([a1,a2,a3])
%% 
%% Then try out the transducer with the string (set option 'chars' to
%% on first):
%% 
%% topological
%%
%% The result you should get is:
%% 
%% top#o#logical
%%
%% The marker used is the letter "#". The placement of the two #'s
%% indicates that a1 matched against "top", a2 matched against "o"
%% and a3 matched against "logical".

macro(mark_boundaries(L),[Exp,[] x mark]):-
   boundaries(L,Exp0),
   greed(L,Exp0,Exp).

%% boundaries
%% ----------
%% The markup proceeds in a generate-and-test fashion. The
%% "boundaries" macro is a generator, which guesses a markup for the
%% string.

macro(boundaries(L),Exp) :-
   boundaries(L,Exp).

boundaries([H|T],Output) :-
    boundaries(T,H,Output).

boundaries([],F,[F o non_markers]).
boundaries([H|R0],F,[F o non_markers, [] x mark |R]) :-
   boundaries(R0,H,R).

%% greed(ListOfRegexes,Alphabet,Exp2BFiltered,ComposedFilters)
%% -----------------------------------------------------------
%% The ComposedFilters will check a marked-up string of the form
%% x1mx2m...xn, where x1 ... xn are sequences and 'm' is the
%% character 'm' used as a marker. Each x_i is accepted by
%% Regex_i. The filters ensure that, going from left to right through
%% ListOfRegexes, each Regex has applied as greedily as possible in
%% marking up x1 ... xn.
%%
%% As an example, for the goal greed([a1,a2,a3],regex,Result),
%% Result will be bound to:
%%
%%                            regex 
%%
%%                              o 
%%
%%  identity( ~ [ignore_spx(a1,#),ignore_s(a2,#),ignore_s(a3,#)]))
%%
%%                               o 
%%
%% identity( ~ [non_markers(a1),#,ignore_spx(a2,#),ignore_s(a3,#)])) 
%%
%% Filter 1 forces a1 to be as greedy as possible, while Filter 2
%% forces a2 to be as greedy as possible. 

greed(L,Exp2BFiltered,ComposedFilters) :-
   aux_greed(L,[],Filters),
   compose_filters(Filters,Exp2BFiltered,ComposedFilters).

aux_greed([H|T],Front,Filters):-
    aux_greed(T,H,Front,Filters,_CurrentFilter).

aux_greed([],F,_,[],[ignore_(non_markers(F),mark)]).
aux_greed([H|R0],F,Front,[~ L1|R],[ignore_(non_markers(F),mark)|R1]) :-
   lists:append(Front,[ignorex_1(non_markers(F),mark)|R1],L1),
   lists:append(Front,[non_markers(F),mark],NewFront),
   %% R is for the rest of the filters to be constructed
   %% R1 is for the rest of the current filter being constructed
   aux_greed(R0,H,NewFront,R,R1).

%% extension '_1': ignore at least one instance
macro(ignorex_1(E1,E2),
   range(identity(E1) o [[identity(?)*,[] x E2]+,identity(?)+])#).

%% compose_filters(Patterns,Exp2BFiltered,ComposedFilters)
%% ------------------------------------------------------
%% Patterns is a list of patterns to be filtered out by
%% ComposedFilters.

compose_filters([], SoFar, SoFar). %% :- !.
compose_filters([F|R], SoFar, Composed) :-
   compose_filters(R,(SoFar o identity(F)),Composed).

%% (this is an example from Jeffrey Friedl's Mastering Regular
%% Expressions, O'Reilly, 1997, page 177. 
%% the objective being to mimic a `longest match' preference')
%%
%% This was implemented as one part of an algorithm for compiling
%% an extended form of rewrite rules into transducers. See:
%% 
%%     Dale Gerdemann, Gertjan van Noord, "Transducers From Rewrite
%%     Rules with Backreferences" EACL 99.
%%
%% a1, a2, a3
%% ----------
%% This defines the three regexes used in Jeffrey Friedl's example
%% on p. 117. The idea is that the string:
%%   
%%     t o p o l o g i c a l
%%
%% can be partitioned in various ways into x1.x2.x3 such that x1
%% is recognized by a1, x2 by a2 and x3 by a3. The goal is to find 
%% the partitioning that allows a1 to match as long a string as
%% possible, with a2 matching the maximum possible of what's left
%% and so on. 
%% If we simply match "t o p o l o g i c a l" against [a1,a2,a3], we
%% get a yes/no answer, with no clue as to what portion of the string
%% was matched by a1, a2 and a3. So instead what we will do is to use
%% the sequence [a1,a2,a3] to build a transducer which inserts some
%% kind of marker after the portion of the string matched by a1 and
%% a2. Filters are defined to ensure that this transducer will only
%% output the "longest-match" partitioning.

%%%%%%%%%%%%%
%%% Tests %%%
%%%%%%%%%%%%%

macro(a1,minimize({[t,o],[t,o,p]})).
macro(a2,minimize({[o],[p,o,l,o]}^)).
macro(a3,minimize({[g,i,c,a,l],[o^,l,o,g,i,c,a,l]})).

macro(t1,minimize({[t,o] x to,[t,o,p] x top})).
macro(t2,minimize({[] x epsilon, o x o,[p,o,l,o] x polo})).
macro(t3,minimize({[g,i,c,a,l] x gical,
                     [o,l,o,g,i,c,a,l] x ological,
                     [l,o,g,i,c,a,l] x logical})).

macro(test0,lm_concat([t1,t2,t3])).


%% or:
%% fsa -v -aux replace -r 'range(word(topological) o test0)' \
%%           | fsa symbol_separator=32 -p