`
dcaoyuan
  • 浏览: 299058 次
社区版块
存档分类
最新评论

Parse JSON to xmerl Compatible XML Tree via A Simple XML State Machine

阅读更多

Updated Aug 16: Fix bugs when json is an array. Add a 'json:root' element always since valid xml should have a root. Remove 'obj' tag that is not necessary.

Updated Aug 15: A more complete json_parser.erl. Thanks for tonyg's beautiful work, fixed some bugs.

Updated Aug 5: Rewrote json_parser.erl based on tonyg's RFC4627 implementation, fixed some bugs.

In my previous blog post, "A Simple XML State Machine Accepting SAX Events to Build xmerl Compatible XML Tree: icalendar demo", I wrote a simple state machine to parse icalendar into an xmerl-compatible XML tree. This time, I'll use the same state machine to parse a JSON expression into an xmerl-compatible XML tree. The work is fairly simple:

%%---------------------------------------------------------------------------
%% Copyright (c) 2007 Tony Garnock-Jones <tonyg@kcbbs.gen.nz>
%% Copyright (c) 2007 LShift Ltd. <query@lshift.net>
%% Copyright (c) 2007 LightPole, Inc. 
%%
%% Permission is hereby granted, free of charge, to any person
%% obtaining a copy of this software and associated documentation
%% files (the "Software"), to deal in the Software without
%% restriction, including without limitation the rights to use, copy,
%% modify, merge, publish, distribute, sublicense, and/or sell copies
%% of the Software, and to permit persons to whom the Software is
%% furnished to do so, subject to the following conditions:
%%
%% The above copyright notice and this permission notice shall be
%% included in all copies or substantial portions of the Software.
%%
%% THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
%% EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
%% MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
%% NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
%% BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
%% ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
%% CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
%% SOFTWARE.
%%---------------------------------------------------------------------------
%%
-module(json_parser).

%% Default state-machine callback. Not referenced by the code visible in this
%% module; parse_to_xml/1 spells the same fun out directly.
-define(stateMachine, fun xml_sm:state/2).

%% Namespace URI reported in the start/end events of the synthetic root element.
-define(JsonNSUri,   "http://www.lightpole.net/xmlns/1.0").
%% Namespace declaration attribute attached to the root element.
-define(JsonNSAtrr,  {'xmlns:json', ?JsonNSUri}).
%% Qualified name of the root element that wraps every parsed document.
-define(JsonNSRoot,  'json:root').
%% Element name used for the items of a top-level JSON array.
-define(JsonNSArray, 'json:array').

%% Parser context threaded through the parse_* functions.
%%   machine - fun/2 invoked as Machine(Event, State) for every emitted event
%%   qname   - element name used when emitting the items of an array
-record(context, {machine,
                  qname}).

-export([parse_to_xml/1,
         parse_to_poet/1]).

-export([test/0]).

%% @doc Parse JSON text (string or binary), driving xml_sm:state/2 with the
%% generated events. Returns whatever that state machine returns for the
%% final {endDocument} event.
parse_to_xml(Data) ->
    Context = #context{machine = fun xml_sm:state/2},
    parse(Data, Context).
        
%% @doc Parse JSON text (string or binary), driving poet_sm:state/2 with the
%% generated events. Returns whatever that state machine returns for the
%% final {endDocument} event.
parse_to_poet(Data) ->
    Context = #context{machine = fun poet_sm:state/2},
    parse(Data, Context).

%% Run a whole document through the state machine held in Context.
%% Binaries are converted to a list of bytes first. The machine is started
%% with an `undefined' state and a {startDocument} event, fed the events of
%% the root value, and finished with {endDocument}; the result of that last
%% call is returned.
parse(Input, Context) when is_binary(Input) ->
    Chars = binary_to_list(Input),
    parse(Chars, Context);
parse(Chars, #context{machine = Machine} = Context) ->
    Started = Machine({startDocument}, undefined),
    Parsed = parse_root(skip_ws(Chars), Started, Context),
    Machine({endDocument}, Parsed).

%% since a valid xml should have a root element, we add one here.
%% Parse the top-level JSON value, which must be an object or an array.
%% The content is always wrapped in a synthetic 'json:root' element because a
%% valid XML document needs a single root. Items of a top-level array are
%% emitted under the 'json:array' element name.
parse_root([${ | T], State, Context) ->
    parse_root_with(fun parse_object/3, undefined, T, State, Context);
parse_root([$[ | T], State, Context) ->
    parse_root_with(fun parse_array/3, ?JsonNSArray, T, State, Context).

%% Emit the root start event, parse the body with BodyParser using QName as
%% the context's element name, then emit the root end event.
parse_root_with(BodyParser, QName, T, State, #context{machine = Machine} = Context) ->
    Opened = Machine({startElement, ?JsonNSUri, root, ?JsonNSRoot, [?JsonNSAtrr]}, State),
    BodyContext = Context#context{qname = QName},
    {_Rest, Filled} = BodyParser(skip_ws(T), Opened, BodyContext),
    Machine({endElement, ?JsonNSUri, root, ?JsonNSRoot}, Filled).

%% Parse the members of a JSON object whose opening '{' has already been
%% consumed. Returns {Rest, State}, where Rest is the input following the
%% closing '}'.
%%
%% Each member "name": value becomes an element named `name'. When the value
%% is an array, no wrapper element is emitted; instead every array item is
%% emitted as its own element named `name'.
parse_object([$} | T], State, _Context) ->
    {T, State};
parse_object([$, | T], State, Context) ->
    parse_object(skip_ws(T), State, Context);
parse_object([$" | T], State, #context{machine = MachineFun} = Context) ->
    %% T already points inside the quoted key, so whitespace must NOT be
    %% skipped here: it is part of the key (parse_value/3 treats string values
    %% the same way).
    {Rest, ObjNameStr} = parse_string(T, []),
    %% NOTE(review): list_to_atom/1 on untrusted input can exhaust the atom
    %% table, since atoms are never garbage collected. The events emitted here
    %% carry the element name as an atom, so the call is kept; restrict the
    %% input source or the set of accepted keys if the JSON is not trusted.
    ObjName = list_to_atom(ObjNameStr),
    Context1 = Context#context{qname = ObjName},
    [$: | T1] = skip_ws(Rest),
    {Rest1, State1} =
        case skip_ws(T1) of
            [$[ | T2] ->
                %% the value is an array: emit a run of elements named ObjName
                parse_array(skip_ws(T2), State, Context1);
            ValueChars ->
                StateX1 = MachineFun({startElement, "", ObjName, ObjName, []}, State),
                {RestX, StateX2} = parse_value(ValueChars, StateX1, Context1),
                StateX3 = MachineFun({endElement, "", ObjName, ObjName}, StateX2),
                {RestX, StateX3}
        end,
    parse_object(skip_ws(Rest1), State1, Context1).

%% Parse the items of a JSON array whose opening '[' has already been
%% consumed. Every item is wrapped in an element named after the context's
%% qname. Returns {Rest, State}, where Rest is the input following the
%% closing ']'.
%%
%% Exits with `syntax_error' when the input ends before the closing ']' or
%% when an item consumes no input; without these checks the function would
%% recurse forever on the same position.
parse_array([$] | T], State, _Context) ->
    {T, State};
parse_array([$, | T], State, Context) ->
    parse_array(skip_ws(T), State, Context);
parse_array([], _State, _Context) ->
    %% unterminated array
    exit(syntax_error);
parse_array(Chars, State, #context{machine = MachineFun, qname = QName} = Context) ->
    State1 = MachineFun({startElement, "", QName, QName, []}, State),
    case parse_value(Chars, State1, Context) of
        {Chars, _Unused} ->
            %% Chars is already bound, so this matches only when the value
            %% parser made no progress (e.g. an unrecognised token).
            exit(syntax_error);
        {Rest, State2} ->
            State3 = MachineFun({endElement, "", QName, QName}, State2),
            parse_array(skip_ws(Rest), State3, Context)
    end.

%% Parse a single JSON value at the head of the input and return
%% {Rest, State}. Scalars (true/false/null, strings, numbers) are reported to
%% the state machine as a {characters, Text} event; objects and arrays are
%% delegated to parse_object/3 and parse_array/3. Empty input is returned
%% unchanged without emitting anything.
parse_value([], State, _Context) ->
    {[], State};
parse_value("true" ++ Rest, State, #context{} = Context) ->
    emit_characters("true", Rest, State, Context);
parse_value("false" ++ Rest, State, #context{} = Context) ->
    emit_characters("false", Rest, State, Context);
parse_value("null" ++ Rest, State, #context{} = Context) ->
    emit_characters("null", Rest, State, Context);
parse_value([$" | T], State, #context{} = Context) ->
    {Rest, Text} = parse_string(T, []),
    emit_characters(Text, Rest, State, Context);
parse_value([${ | T], State, Context) ->
    parse_object(skip_ws(T), State, Context);
parse_value([$[ | T], State, Context) ->
    parse_array(skip_ws(T), State, Context);
parse_value(Chars, State, #context{} = Context) ->
    %% anything else is treated as a number
    {Rest, Digits} = parse_number(skip_ws(Chars), []),
    emit_characters(Digits, Rest, State, Context).

%% Send Text to the state machine as character data and pair the new state
%% with the remaining input.
emit_characters(Text, Rest, State, #context{machine = Machine}) ->
    {Rest, Machine({characters, Text}, State)}.



%% Read the body of a JSON string whose opening quote has already been
%% consumed. Acc holds the characters read so far in reverse order.
%% Returns {Rest, Text}, where Rest is the input after the closing quote.
%% Backslash escapes are handed to parse_escaped_char/3.
parse_string([$" | Rest], Acc) ->
    Text = lists:reverse(Acc),
    {Rest, Text};
parse_string([$\\, Escape | Rest], Acc) ->
    parse_escaped_char(Escape, Rest, Acc);
parse_string([Ch | Rest], Acc) ->
    parse_string(Rest, [Ch | Acc]).

%% Decode the character following a backslash inside a JSON string, push the
%% decoded code point onto Acc and continue with parse_string/2.
%% `\uXXXX' consumes four hex digits and yields their 16-bit value.
parse_escaped_char($b, Rest, Acc) ->
    parse_string(Rest, [$\b | Acc]);
parse_escaped_char($t, Rest, Acc) ->
    parse_string(Rest, [$\t | Acc]);
parse_escaped_char($n, Rest, Acc) ->
    parse_string(Rest, [$\n | Acc]);
parse_escaped_char($f, Rest, Acc) ->
    parse_string(Rest, [$\f | Acc]);
parse_escaped_char($r, Rest, Acc) ->
    parse_string(Rest, [$\r | Acc]);
parse_escaped_char($/, Rest, Acc) ->
    parse_string(Rest, [$/ | Acc]);
parse_escaped_char($\\, Rest, Acc) ->
    parse_string(Rest, [$\\ | Acc]);
parse_escaped_char($", Rest, Acc) ->
    parse_string(Rest, [$" | Acc]);
parse_escaped_char($u, [D0, D1, D2, D3 | Rest], Acc) ->
    %% fold the four nibbles, most significant first, into one code unit
    CodeUnit = lists:foldl(fun(Digit, Sum) -> (Sum bsl 4) + digit_hex(Digit) end,
                           0,
                           [D0, D1, D2, D3]),
    parse_string(Rest, [CodeUnit | Acc]).

%% Numeric value (0..15) of a hexadecimal digit character; accepts 0-9, A-F
%% and a-f. Any other argument fails with function_clause.
digit_hex(C) when is_integer(C), C >= $0, C =< $9 -> C - $0;
digit_hex(C) when is_integer(C), C >= $A, C =< $F -> C - $A + 10;
digit_hex(C) when is_integer(C), C >= $a, C =< $f -> C - $a + 10.

%% Finish a number token: Acc holds its characters in reverse order.
%% Returns {Rest, Text}. The number is deliberately kept as its textual form
%% (it is emitted as character data), not converted to an integer or float.
finish_number(Rest, Acc) ->
    {Rest, lists:reverse(Acc)}.

%% Entry point for number parsing. Consumes an optional leading minus sign
%% and hands the remainder to parse_number1/2. Exits with `syntax_error' on
%% empty input.
parse_number(Input, Acc) ->
    case Input of
        [] ->
            exit(syntax_error);
        [$- | Unsigned] ->
            parse_number1(Unsigned, [$- | Acc]);
        _ ->
            parse_number1(Input, Acc)
    end.

%% Parse the integer part of a number (after any sign), then an optional
%% fraction and exponent. Acc holds the characters accepted so far in reverse
%% order. Returns {Rest, Text} via finish_number/2.
%%
%% Exits with `syntax_error' when no digit is present at all. Otherwise an
%% arbitrary token such as `abc' would be accepted as an empty number without
%% consuming any input.
parse_number1(Rest, Acc) ->
    {Acc1, Rest1} = parse_int_part(Rest, Acc),
    %% parse_int_part/2 only ever prepends digits, so an unchanged
    %% accumulator means the integer part is missing.
    Acc1 =:= Acc andalso exit(syntax_error),
    case Rest1 of
        [] ->
            finish_number([], Acc1);
        [$. | More] ->
            {Acc2, Rest2} = parse_int_part(More, [$. | Acc1]),
            %% a fraction is present, so the exponent needs no ".0" inserted
            parse_exp(Rest2, Acc2, false);
        _ ->
            parse_exp(Rest1, Acc1, true)
    end.


%% Consume the leading run of decimal digits from Chars and push them onto
%% the reversed accumulator. Returns {Acc1, Rest}; note the accumulator comes
%% first in this tuple.
parse_int_part(Chars, Acc) ->
    {Digits, Rest} = lists:splitwith(fun is_digit/1, Chars),
    {lists:reverse(Digits, Acc), Rest}.

%% Parse an optional exponent. If the input starts with `e' or `E', the
%% exponent is parsed by parse_exp1/3; otherwise the number ends here.
parse_exp([Marker | T], Acc, NeedFrac) when Marker =:= $e; Marker =:= $E ->
    parse_exp1(T, Acc, NeedFrac);
parse_exp(Rest, Acc, _NeedFrac) ->
    finish_number(Rest, Acc).

%% Parse the exponent digits that follow `e'/`E'. The marker is always
%% written as a lowercase `e'. When NeedFrac is true the mantissa had no
%% fraction, so ".0" is inserted before the `e' (Acc is reversed, hence the
%% reversed order of the pushed characters).
parse_exp1(Rest, Acc, NeedFrac) ->
    WithMarker =
        case NeedFrac of
            true -> [$e, $0, $. | Acc];
            _ -> [$e | Acc]
        end,
    {Acc1, Rest1} = parse_signed_int_part(Rest, WithMarker),
    finish_number(Rest1, Acc1).

%% Like parse_int_part/2, but first accepts an optional `+' or `-' sign,
%% which is kept in the accumulator.
parse_signed_int_part([Sign | T], Acc) when Sign =:= $+; Sign =:= $- ->
    parse_int_part(T, [Sign | Acc]);
parse_signed_int_part(Rest, Acc) ->
    parse_int_part(Rest, Acc).

%% True when C is an integer character code in the range $0..$9; false for
%% any other term.
is_digit(C) ->
    is_integer(C) andalso C >= $0 andalso C =< $9.
    

%% Drop leading characters whose code is at most 32 (space and the control
%% characters below it). Anything else, including the empty list, is
%% returned unchanged.
skip_ws([Ch | Rest]) when Ch =< $\s ->
    skip_ws(Rest);
skip_ws(Remaining) ->
    Remaining.



%% @doc Manual smoke test: parses three sample JSON documents, prints the
%% exported XML (and the POET result for the first one) to `user', and
%% finally looks up the first business' latitude with XPath.
%%
%% NOTE(review): the {ok, _} matches assume that xml_sm:state/2 and
%% poet_sm:state/2 return {ok, Result} for the endDocument event; confirm
%% against those modules.
test() ->
    Text1 = "{\"firstname\":\"Caoyuan\", \"iq\":\"150\"}",
    {ok, Xml1} = parse_to_xml(Text1),
    XmlText1 = lists:flatten(xmerl:export_simple([Xml1], xmerl_xml)),
    io:fwrite(user, "Parsed XML: ~n~p~n", [XmlText1]),
    {ok, Poet1} = parse_to_poet(Text1),
    io:fwrite(user, "Parsed POET: ~n~p~n", [Poet1]),

    Text2 = "[{\"firstname\":\"Caoyuan\", \"iq\":\"150\"}, 
              {\"firstname\":\"Haobo\", \"iq\":150}]", 
    {ok, Xml2} = parse_to_xml(Text2),
    XmlText2 = lists:flatten(xmerl:export_simple([Xml2], xmerl_xml)),
    io:fwrite(user, "Parsed: ~n~p~n", [XmlText2]),

    Text = "
{\"businesses\": [{\"address1\": \"650 Mission Street\",
                   \"address2\": \"\",
                   \"avg_rating\": 4.5,
                   \"categories\": [{\"category_filter\": \"localflavor\",
                                     \"name\": \"Local Flavor\",
                                     \"search_url\": \"http://lightpole.net/search\"}],
                   \"city\": \"San Francisco\",
                   \"distance\": 0.085253790020942688,
                   \"id\": \"4kMBvIEWPxWkWKFN__8SxQ\",
                   \"latitude\": 37.787185668945298,
                   \"longitude\": -122.40093994140599},
                  {\"address1\": \"25 Maiden Lane\",
                   \"address2\": \"\",
                   \"avg_rating\": 5.0,
                   \"categories\": [{\"category_filter\": \"localflavor\",
                                     \"name\": \"Local Flavor\",
                                     \"search_url\": \"http://lightpole.net/search\"}],
                   \"city\": \"San Francisco\",
                   \"distance\": 0.23186808824539185,
                   \"id\": \"O1zPF_b7RyEY_NNsizX7Yw\",
                   \"latitude\": 37.788387,
                   \"longitude\": -122.40401}]} ",
    {ok, Xml} = parse_to_xml(Text),
    XmlText = lists:flatten(xmerl:export_simple([Xml], xmerl_xml)),
    io:fwrite(user, "Parsed: ~n~p~n", [XmlText]),
    %% The root element is 'json:root' (see ?JsonNSRoot), so the path must use
    %% the `json' prefix; the former `lp' prefix is not declared anywhere.
    Latitude1 = xmerl_xpath:string("/json:root/businesses[1]/latitude/text()", Xml),
    io:format(user, "Latitude1: ~p~n", [Latitude1]).

The result will be something like:

<?xml version="1.0"?>
<json:root xmlns:json="http://www.lightpole.net/xmlns/1.0">
  <businesses>
    <address1>650 Mission Street</address1>
    <address2></address2>
    <avg_rating>4.5</avg_rating>
    <categories>
      <category_filter>localflavor</category_filter>
      <name>Local Flavor</name>
      <search_url>http://lightpole.net/search</search_url>
    </categories>
    <city>San Francisco</city>
    <distance>0.085253790020942688</distance>
    <id>4kMBvIEWPxWkWKFN__8SxQ</id>
    <latitude>37.787185668945298</latitude>
    <longitude>-122.40093994140599</longitude>
  </businesses>
  <businesses>
    <address1>25 Maiden Lane</address1>
    <address2></address2>
    <avg_rating>5.0</avg_rating>
    <categories>
      <category_filter>localflavor</category_filter>
      <name>Local Flavor</name>
      <search_url>http://lightpole.net/search</search_url>
    </categories>
    <city>San Francisco</city>
    <distance>0.23186808824539185</distance>
    <id>O1zPF_b7RyEY_NNsizX7Yw</id>
    <latitude>37.788387</latitude>
    <longitude>-122.40401</longitude>
  </businesses>
</json:root>

Now you can fetch an element with:

> [Latitude1] = xmerl_xpath:string("/json:root/businesses[1]/latitude/text()", Xml),
> Latitude1#xmlText.value.
"37.787185668945298"

Next time, I'll write a simple Erlang Data state machine, which will parse icalendar and json to simple Erlang Lists + Tuples.

The code of xml_sm.erl can be found in my previous blog.

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics