%%           -*-Mode: prolog;-*-

:- multifile rx/2.
:- multifile macro/2.

:- discontiguous rx/2.
:- discontiguous macro/2.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Leftmost/Longest Contexted Replacement %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

macro(replace(T0,Left0,Right),
      pragma([T-T0,Left-Left0], cleanup(
       sigma*
          o
       r(Right)
          o
      f(domain(T))
          o
 left_to_right(domain(T))
          o
 longest_match(domain(T))
          o
    aux_replace(T)
          o
       l1(Left)
          o
       l2(Left)
          o
       sigma*
						     )
     )).

macro(replace(T0), pragma([T-T0], cleanup(
       sigma*
          o
     [[]:'2>', [? - '2>', []:'2>']*]
          o
      f(domain(T))
          o
 left_to_right(domain(T))
          o
 longest_match(domain(T))
          o
    aux_replace(T)
          o
     {`{'1>','2>','<1'}, '<1':[]}*
          o
      (? - '<2')*
          o
       sigma*
				     )
     )).


%%%%%%%%%%%%%%%%%%%%%%%
%%% Special Symbols %%%
%%%%%%%%%%%%%%%%%%%%%%%

macro(rbrack,  {'1>','2>'}           ).
macrl(lbrack,  {'<1','<2'}           ).
macro(brack1,  {'<1','1>'}           ).
macro(brack2,  {'<2','2>'}           ).
macro(brack,   {'<1','<2','1>','2>'} ).
macro(sigma,  `{'<1','<2','1>','2>'} ).

%%%%%%%%%%%%%
%%% Utils %%%
%%%%%%%%%%%%%

macro(intro_(E2),{? ,[] x E2}*).
macro(xintro_(E2),{[],[? ,intro_(E2)]}).
macro(intro_x(E2),{[],[intro_(E2),? ]}).
macro(xintro_x(E2),{[],[? ],[? ,intro_(E2),? ]}).

macro(ignore_(E1,E2),range(E1 o intro_(E2))).
macro(xignore_(E1,E2),range(E1 o xintro_(E2))).
macro(ignore_x(E1,E2),range(E1 o intro_x(E2))).
macro(xignore_x(E1,E2),range(E1 o xintro_x(E2))).

%%%%%%%%%%%%%%%%%%%%%%%
%%% Main Procedures %%%
%%%%%%%%%%%%%%%%%%%%%%%

macro(r(R),
   if_right_then_insert_m('2>',sigma* & R)).

%% This diverges from Mohri & Sproat. Here we just insert just '<2'
%% and not {'<1','<2'}.
%macro(f(F),
%   if_right_then_insert_m('<2',
%                          [xignore_(F,'2>'),'2>'])).

macro(intro(S0),pragma([S-S0], {? - S,[] x S}*)).

macro(f(F), intro('<2')
               o
       l_iff_r('<2',[[xignore_x(sigma* & F,{'<2','2>'}),
			 '<2'^]-'<2', % Modified from
             '2>'])).                 % published version

macro(if_p_then_s(L1,L2), 
      ~[L1,~L2]).
macro(if_s_then_p(L1,L2), 
      ~[~L1,L2]).
macro(p_iff_s(V1,V2), pragma([L1-V1,L2-V2],
      if_p_then_s(L1,L2) 
            & 
      if_s_then_p(L1,L2))).

macro(l_iff_r(L,R), 
   p_iff_s([? *,L],[R,? *])).

macro(left_to_right(Phi), 
      [[? *,
        ['<2' x '<1',
          (  ignore_(sigma* & Phi,brack2)   %% was ignore_x
                     o 
           [[{sigma,'2>'}*,'<2' x []]*,{sigma,'2>'}*]
         ),
         '2>' x '1>']
        ]*,
        ? *]
     ).

macro(longest_match(Phi0), pragma([Phi-Phi0],
     ( ~ $ ['<1',
           (ignore_x(sigma* & Phi,brack)
                        &
           [ignore_(sigma* & Phi,brack),'1>',? *]),
           rbrack
          ])
              o
       {? - '2>', '2>':[]}*
     )).

macro(aux_replace(T), 
      {{sigma,'<2'}, 
       ['<1', sigma* o T o sigma*, '1>' x [] ]
      }*).

macro(l1(L), 
      ignore_(if_m_then_left('<1',sigma* & L), '<2')
                            o
      [[{sigma,'<2'}*,'<1' x []]*,{sigma,'<2'}*]
     ).

macro(l2(L),
   if_m_then_not_left_delete('<2',sigma* & L)).

%%%%%%%%%%%%%
%%% Tests %%%
%%%%%%%%%%%%%

%% Use this test with the string abababa
macro(test1,replace(([a,b] x x),[a,b],a)).

%% Use this test with the string <1 <2 <1 <2 <1 <2 <1 
macro(test2,replace((['<1','<2'] x '1>'),['<1','<2'],'<1')).

%% Use test string: abbaa
macro(test3,replace(({[a,b],b,[b,a],[a,b],a} x x),{[a,b],b},a)).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% User Marker Functions %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% The basic maker functions as defined by Mohri & Sproat are too
%% low level to be easily used directly. The following high-level
%% macros should simplify use in four ways:
%% 
%% 1. The macros provide mnemonic names.
%%
%% 3. The macros encapsulate the need for the tricky use of reverse
%%    embedded inside of reverse in the case of markers sensitive to
%%    right context.
%%
%% 4. For marker2 and marker3, the option is provided of either just
%%    checking the markers or checking and simultaneously deleting the
%%    markers.

macro(if_left_then_insert_m(Marker,Left),
   marker1( [(? - Marker) *,Left],Marker)).

macro(if_right_then_insert_m(Marker,Right),
   reverse(marker1(reverse([Right,(? - Marker) *]),Marker))).

macro(if_m_then_left(Marker,Left),
      marker2( [(? - Marker) *,Left], Marker, filter)).
       
macro(if_m_then_left_delete(Marker,Left),
      marker2( [(? - Marker) *,Left], Marker, delete)).

macro(if_m_then_right(Marker,Right),
   reverse(marker2(reverse([Right,(? - Marker) *]),Marker,filter))).

macro(if_m_then_right_delete(Marker,Right),
   reverse(marker2(reverse([Right,(? - Marker) *]),Marker,delete))).

macro(if_m_then_not_left(Marker,Left),
   marker3( [(? - Marker) *,Left],Marker,filter)).

macro(if_m_then_not_left_delete(Marker,Left),
   marker3( [(? - Marker) *,Left],Marker,delete)).

macro(if_m_then_not_right(Marker,Right),
   reverse(marker2(reverse([Right,(? - Marker) *]),Marker,filter))).

macro(if_m_then_not_right_delete(Marker,Right),
   reverse(marker2(reverse([Right,(? - Marker) *]),Marker,delete))).

%%%%%%%%%%%%%%
%%% type 1 %%%
%%%%%%%%%%%%%%

%% Type 1 has now been simplified so that it will insert just one kind
%% of marker in the appropriate position. This is no longer sufficient
%% to implement Mohri & Sproat's algorithm, since in their step f,
%% they use the markers <1 and <2. 

rx(marker1(Expr,Change),Fa) :-
   fsa_regex:rx(identity(determinize(Expr)),Fa0),
   mark1(Change,Fa0,Fa).


dom_module(r(Module),Module).
dom_module(t(Module,_),Module).

mark1(A0,Fa1,Fa) :-
    fsa_data:fsa_components(Sigma,NrS0,Starts,Finals,Trans0,[],Fa1),
    fsa_data:fsa_components(Sigma,NrS, Starts,NewFinals,Trans,[],Fa),
    dom_module(Sigma,Module),
    Module:regex_notation_to_predicate(A0,A),
    fsa_data:fsa_states_set(Fa1,AllStates),
    ordsets:ord_subtract(AllStates,Finals,NewFinals0),
    lists:append(NewFinals0,NewFinals1,NewFinals),
    add_insertions(Trans0,Trans,Trans2,Trans2,Finals,NewFinals1,NrS0,NrS,A).

%% for every final state F, insert transition from F to some new final
%% state F', inserting Marker.
%% Moreover, all transitions that leave F in original automaton, now
%% leave from F'

add_insertions([],Ts,Ts,[],_,[],NrS,NrS,_).
add_insertions([trans(A,B,C)|Trs],Ts0,Ts,Ts2,Fins,Fs,NrS0,NrS,Mrk):-
    ord_member(A,Fins,Bool),
    (	Bool==yes
    ->	Ts0=[trans(A,[]/Mrk,NrS0)|Ts1],
	Ts2=[trans(NrS0,B,C)|Ts20],
	Fs=[NrS0|Fs0],
	NrS1 is +(NrS0,1),
	add_insertions0(Trs,A,Ts1,Ts,Ts20,Fins,Fs0,NrS0,NrS1,NrS,Mrk)
    ;   Ts0=[trans(A,B,C)|Ts1],
	add_insertions(Trs,Ts1,Ts,Ts2,Fins,Fs,NrS0,NrS,Mrk)
    ).

add_insertions0([],_,Ts,Ts,[],_,[],_,NrS,NrS,_).
add_insertions0([trans(A,B,C)|Trs],A0,Ts0,Ts,Ts2,Fins,Fs,NrS0,NrS1,NrS,Mrk):-
    (	A0==A
    ->	Ts2=[trans(NrS0,B,C)|Ts20],
	add_insertions0(Trs,A0,Ts0,Ts,Ts20,Fins,Fs,NrS0,NrS1,NrS,Mrk)
    ;   add_insertions([trans(A,B,C)|Trs],Ts0,Ts,Ts2,Fins,Fs,NrS1,NrS,Mrk)
    ).

%%%%%%%%%%%%%%
%%% type 2 %%%
%%%%%%%%%%%%%%

%% Type 2 and Type 3 are now provided with a flag to determine
%% whether or not the markeer should be deleted after checking.

rx(marker2(Expr,Change,Flag),Fa) :-
    (	Flag == filter
    ->	fsa_regex:rx(determinize(Expr),Fa0)
    ;	fsa_regex:rx(identity(determinize(Expr)),Fa0)
    ),
    mark2(Change,Flag,Fa0,Fa).

mark2(A0,Flag,Fa1,Fa) :-
    fsa_data:fsa_symbols_decl(Fa1,Sigma),
    dom_module(Sigma,Module),
    Module:regex_notation_to_predicate(A0,A),
    fsa_data:fsa_copy_except(transitions,Fa1,Fa2,Trans0,Trans),
    fsa_data:fsa_copy_except(final_states,Fa2,Fa,Finals,AllStates),
    fsa_data:fsa_states_set(Fa1,AllStates),
    (	Flag == filter
    ->	add_2_filters(Finals,A,Trans1,Trans0)
    ;	add_2_deletions(Finals,A,Trans1,Trans0)
    ),
    sort(Trans1,Trans).

%%%%%%%%%%%%%%
%%% type 3 %%%
%%%%%%%%%%%%%%

rx(marker3(Expr,Change,Flag),Fa) :-
    (	Flag == filter
    ->	fsa_regex:rx(determinize(Expr),Fa0)
    ;	fsa_regex:rx(identity(determinize(Expr)),Fa0)
    ),
    mark3(Change,Flag,Fa0,Fa).

mark3(A0,Flag,Fa1,Fa) :-
    fsa_data:fsa_symbols_decl(Fa1,Sigma),
    dom_module(Sigma,Module),
    Module:regex_notation_to_predicate(A0,A),
    fsa_data:fsa_copy_except(transitions,Fa1,Fa2,Trans0,Trans),
    fsa_data:fsa_copy_except(final_states,Fa2,Fa,Finals,AllStates),
    fsa_data:fsa_states_set(Fa1,AllStates),
    ordsets:ord_subtract(AllStates,Finals,NonFinals),
    (	Flag == filter
    ->	add_2_filters(NonFinals,A,Trans1,Trans0)
    ;	add_2_deletions(NonFinals,A,Trans1,Trans0)
    ),
    sort(Trans1,Trans).

add_2_filters([],_) --> [].
add_2_filters([Final|Fs],A) -->
    [trans(Final,A,Final)],
    add_2_filters(Fs,A).

add_2_deletions([],_) --> [].
add_2_deletions([Final|Fs],A) -->
    [trans(Final,A/[],Final)],
    add_2_deletions(Fs,A).


%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%% Longest-Match Concatenate %%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

macro(lm_concat(Ts),mark_boundaries(Domains) o ConcatTs) :-
   extract_domains(Ts,Domains),
   concatT(Ts,ConcatTs).

extract_domains([],[]).
extract_domains([F|R0],[domain(F)|R]) :-
   extract_domains(R0,R).

concatT([],[]).
concatT([T|Ts], 
        [T,mark x []|RestConcat]) :-
   concatT(Ts,RestConcat).

macro(mark, # ).

%% mark_boundaries
%% ---------------
%% This defines the markup transducer. As an example try:
%%
%% mark_boundaries([a1,a2,a3])
%% 
%% Then try out the transducer with the string (set option 'chars' to
%% on first):
%% 
%% topological
%%
%% The result you should get is:
%% 
%% top#o#logical
%%
%% The marker used is the letter "#". The placement of the two #'s
%% indicates that a1 matched against "top", a2 matched against "o"
%% and a3 matched against "logical".

macro(mark_boundaries(L),[Exp,[] x mark]):-
   boundaries(L,Exp0),
   greed(L,Exp0,Exp).

%% boundaries
%% ----------
%% The markup proceeds in a generate-and-test fashion. The
%% "boundaries" macro is a generator, which guesses a markup for the
%% string.

macro(boundaries(L),Exp) :-
   boundaries(L,Exp).

boundaries([H|T],Output) :-
    boundaries(T,H,Output).

boundaries([],F,[F]).
boundaries([H|R0],F,[F, [] x mark |R]) :-
   boundaries(R0,H,R).

%% greed(ListOfRegexes,Alphabet,Exp2BFiltered,ComposedFilters)
%% -----------------------------------------------------------
%% The ComposedFilters will check a marked-up string of the form
%% x1mx2m...xn, where x1 ... xn are sequences and 'm' is the
%% character 'm' used as a marker. Each x_i is accepted by
%% Regex_i. The filters ensure that, going from left to right through
%% ListOfRegexes, each Regex has applied as greedily as possible in
%% marking up x1 ... xn.
%%
%% As an example, for the goal greed([a1,a2,a3],regex,Result),
%% Result will be bound to:
%%
%%                            regex 
%%
%%                              o 
%%
%%  identity( ~ [ignore_spx(a1,#),ignore_s(a2,#),ignore_s(a3,#)]))
%%
%%                               o 
%%
%% identity( ~ [non_markers(a1),#,ignore_spx(a2,#),ignore_s(a3,#)])) 
%%
%% Filter 1 forces a1 to be as greedy as possible, while Filter 2
%% forces a2 to be as greedy as possible. 

greed(L,Exp2BFiltered,ComposedFilters) :-
   aux_greed(L,[],Filters),
   compose_filters(Filters,Exp2BFiltered,ComposedFilters).

aux_greed([H|T],Front,Filters):-
    aux_greed(T,H,Front,Filters,_CurrentFilter).

aux_greed([],F,_,[],[ignore(F,mark)]).
aux_greed([H|R0],F,Front,[~ L1|R],[ignore(F,mark)|R1]) :-
   lists:append(Front,[ignorex_1(F,mark)|R1],L1),
   lists:append(Front,[F,mark],NewFront),
   %% R is for the rest of the filters to be constructed
   %% R1 is for the rest of the current filter being constructed
   aux_greed(R0,H,NewFront,R,R1).

%% extension '_1': ignore at least one instance
macro(ignorex_1(E1,E2),
   range(E1 o [[? *,[] x E2]+,? +])#).

%% compose_filters(Patterns,Exp2BFiltered,ComposedFilters)
%% ------------------------------------------------------
%% Patterns is a list of patterns to be filtered out by
%% ComposedFilters.

compose_filters([], SoFar, SoFar). %% :- !.
compose_filters([F|R], SoFar, Composed) :-
   compose_filters(R,(SoFar o identity(F)),Composed).

%% (this is an example from Jeffrey Friedl's Mastering Regular
%% Expressions, O'Reilly, 1997, page 177. 
%% the objective being to mimic a `longest match' preference')
%%
%% This was implemented as one part of an algorithm for compiling
%% an extended form of rewrite rules into transducers. See:
%% 
%%     Dale Gerdemann, Gertjan van Noord, "Transducers From Rewrite
%%     Rules with Backreferences" EACL 99.
%%
%% a1, a2, a3
%% ----------
%% This defines the three regexes used in Jeffrey Friedl's example
%% on p. 117. The idea is that the string:
%%   
%%     t o p o l o g i c a l
%%
%% can be partitioned in various ways into x1.x2.x3 such that x1
%% is recognized by a1, x2 by a2 and x3 by a3. The goal is to find 
%% the partitioning that allows a1 to match as long a string as
%% possible, with a2 matching the maximum possible of what's left
%% and so on. 
%% If we simply match "t o p o l o g i c a l" against [a1,a2,a3], we
%% get a yes/no answer, with no clue as to what portion of the string
%% was matched by a1, a2 and a3. So instead what we will do is to use
%% the sequence [a1,a2,a3] to build a transducer which inserts some
%% kind of marker after the portion of the string matched by a1 and
%% a2. Filters are defined to ensure that this transducer will only
%% output the "longest-match" partitioning.

%%%%%%%%%%%%%
%%% Tests %%%
%%%%%%%%%%%%%

macro(a1,minimize({[t,o],[t,o,p]})).
macro(a2,minimize({[o],[p,o,l,o]}^)).
macro(a3,minimize({[g,i,c,a,l],[o^,l,o,g,i,c,a,l]})).

macro(t1,minimize({[t,o] x to,[t,o,p] x top})).
macro(t2,minimize({[] x epsilon, o x o,[p,o,l,o] x polo})).
macro(t3,minimize({[g,i,c,a,l] x gical,
                     [o,l,o,g,i,c,a,l] x ological,
                     [l,o,g,i,c,a,l] x logical})).

macro(test0,lm_concat([t1,t2,t3])).


%% ord_member(+Elt, +Set, -Bool)
%% Bool is yes when Elt is a member of Set, otherwise it's no
ord_member(Elt,Set,Bool):-
    ord_member1(Set,Elt,Bool).

ord_member(<,_,_,no).
ord_member(=,_X,_Es,yes).
ord_member(>,X,Es,YesNo) :-
    ord_member1(Es,X,YesNo).

ord_member1([],_,no).
ord_member1([E|Es],X,YesNo) :-
    compare(C,X,E),
    ord_member(C,X,Es,YesNo).


%% or:
%% fsa -v -aux replace_nm -r 'range(word(topological) o test0)' \
%%           | fsa symbol_separator=32 -p


