Sources
Delphi Russian Knowledge Base
DRKB - это самая большая и удобная в использовании база знаний по Дельфи в рунете, составленная Виталием Невзоровым

Parsing XML

01.01.2007
{
 
Here I will show one way to parse XML document
 
The main concept of XML is using containers for XML objects -
so we will use Tree concept while building our XML object from XML document.
 
XML text uses containers (<TAG ... >...</TAG>) or simple definitions (<TAG ... />)
in each TAG we can use parameters (<TAG key1="value1" key2="value2">... <SIMPLE key3="value3"> ...</TAG>)
 
Finally we will have an array of objects, describing XML tags. Every object of this
class will have an array of children if needed, and a hash to describe properties of it.
 
 
For example if we have a text
 
<UL name="xxx">
  <LI name="xxx1"/>
  <LI name="xxx2"/>
  <LI name="xxx3"/>
  <LI name="xxx4"/>
</UL>
 
we will have one root object (named "UL") in wich we will have 4 children
(named "LI" with different sets of properties - from "NAME"="xxx1" to "NAME"="xxx4")
 
This is not a trivial task - so we will make a unit to solve this...
I will try to comment some here...
 
if you have any comments for this unit - write to me: sunworx@mail.ru; yz@infoteh.ru
 
}
unit YZXMLParser;
 
interface
 
uses
  SysUtils, ComCtrls;
 
type 
  THashElement = record
    Key, Value: string;
  end;
 
type 
  THashElementArr = array of THashElement;
 
 
  // here  we declare a THash class to use in our parser
  // The concept of THash is to retreive named values from an array
  // Hash is an array where index is a string (example V[Key]=value,
  // whehe Key and Value are of type string)
 
  // The main purpose of this class is to rerurn a value of a String-named key
  //(example: s:=hash['someValue'])
 
 
  // the description of a hash element we use
 
type 
  THash = class(TObject)
  private
    Arr: THashElementArr;
    function GetValue(Key: string): string;
    procedure SetValue(Key: string; const VValue: string);
    function GetKeys: StrArr;
    function GetValues: StrArr;
    function GetCount: Integer;
    function Getempty: Boolean;
  public
    property Value[Key: string]: string read GetValue write SetValue; default;
    property Values: StrArr read GetValues;
    property Keys: StrArr read GetKeys;
    property Count: Integer read GetCount;
    property Empty: Boolean read Getempty;
    procedure Clear;
    constructor Create;
    destructor Destroy; override;
  end;
 
  TYZHash = THash;
 
 
type
 
  // Here we declare some definitions for our parser to know what
  // identifier we would receive next in our text
  // these  values will be used in the result of WhatNext() function which will scan text for keys
 
  TYZXMLMarker = (xmlOpenTag, xmlCloseTagShort, xmlCloseTag, xmlCloseTagLong,
    xmlEOF, xmlIdentifier, xmlunknown); / *
 
  Because we use recursive definition of our class(as TreeView, where we declare children of
    the same type in opur type 
    declaration) we must use forward declaration
    * /
 
 
  // The definition of a TAG class
 
 
  TYZXMLTag  = class;
  TYZXMLTags = array of TYZXMLTag;
 
  TYZXMLTag = class(TObject)
  private
    FData: TYZHash;
    FParent: TYZXMLTag;
    FName: string;
 
    function GetValue(AName: string): string;
    procedure SetName(const Value: string);
    procedure SetValue(AName: string; const Value: string);
    function GetCount: Integer;
    function GetValueNames: strarr;
 
  public
    Children: TYZXMLTags; // these are our child nodes
    Text: string;
 
 
    property Name: string read FName write SetName; // name of a tag
    property Values[AName: string]: string read GetValue write SetValue;
      default; // values of properties of a tag (hash values)
    property ValueNames: strarr read GetValueNames;
    // array of strings returniong names of all props of this tag
    property Count: Integer read GetCount;
    // a count of children of a tag (if this tag is a container)
 
    function SkipSpaces(var AData: string; var APos: Integer;
      RememberBreaks: Boolean = False): Char;
    // internal. for skip spaces (also CR or LF or other non-text chars) while parsing text
 
    function ParseValue(var AData: string; var APos: Integer): Boolean;
    // parse value (calling when found a parameter of a tag)
    function ParseName(var AData: string; var APos: Integer): Boolean;
    // parse key of parameter in a tag
 
    // these two procs used to parse any text found while parsing XML
    function ParseString(var AData: string; var APos: Integer;
      RememberBreaks: Boolean = False): string;
    function ParseQuotedString(var AData: string; var APos: Integer;
      QIndef: Char = '"'): string;
 
    // returnes the type of next identifier in XML
    function WhatNext(var AData: string; var APos: Integer;
      var ANext: Integer; RememberBreaks: Boolean = False): TYZXMLMarker;
 
 
    // This is a main procedure of our class - AData is a string,
    // containing all XML data (you can use TMemo.Text, for example, as a parameter of AData)
    function ParseXML(var AData: string; var APos: Integer): Boolean;
 
    // This function returnes a text string, built based on data, stored in an object.
    function GenerateXML(var AData: string; ATab: string = ''): Boolean;
 
    // returnes char from string at specified pos (#0 if not in range)
    function CharAt(var S: string; APos: Integer): Char;
 
 
    function TagNameExists(AName: string): Boolean;
 
    // Adds a child to children array of a current tag
 
    function AddChild: TYZXMLTag;
 
    // Initializes current tag and deletes all existing children
    procedure Clear; virtual;
 
    constructor Create(AParent: TYZXMLTag); virtual;
    destructor Destroy; virtual;
  end;
 
 
type 
  TYZXMLParser = class(TYZXMLTag)
  private
    Header: TYZHash;
    procedure _BuildTreeView(ATreeView: TTreeView; ANode: TTreeNode; ATag: TYZXMLTag);
  public
    property HeaderValues: TYZHash read Header;
 
    procedure BuildTreeView(ATreeView: TTreeView);
    function Parse(AData: string): Boolean;
    function Generate(var AData: string): Boolean;
    constructor Create;
    destructor Destroy;
  end;
 
implementation
 
//==============================================================================
 
{ TYZXMLTag }
 
function TYZXMLTag.AddChild: TYZXMLTag;
begin
  setlength(children, Length(children) + 1);
  Result := TYZXMLTag.Create(Self);
  children[Length(children) - 1] := Result;
end;
 
//------------------------------------------------------------------------------
 
procedure TYZXMLTag.Clear;
var 
  i: Integer;
begin
  for i := 0 to Count - 1 do if children[i] <> nil then Children[i].Destroy;
  setlength(children, 0);
  FData.Clear;
  Text := '';
end;
 
//------------------------------------------------------------------------------
 
constructor TYZXMLTag.Create(AParent: TYZXMLTag);
begin
  inherited Create;
  FData   := TYZHash.Create;
  FParent := AParent;
  Clear;
end;
 
//------------------------------------------------------------------------------
 
destructor TYZXMLTag.Destroy;
begin
  Clear;
  FData.Destroy;
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.GetCount: Integer;
begin
  Result := Length(children);
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.GetValue(AName: string): string;
begin
  Result := FData[AName];
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.ParseName(var AData: string; var APos: Integer): Boolean;
begin
  Result := False;
  FName  := ParseString(AData, APos);
  if fname = '' then Exit;
  Result := True;
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.ParseQuotedString(var AData: string; var APos: Integer;
  QIndef: Char = '"'): string;
var 
  i: Integer;
  skipnext: Boolean;
  z: Char;
begin
  Result := '';
  if CharAt(AData, APos) <> QIndef then Exit;
  i        := apos;
  skipnext := True;
  repeat
    if not skipnext then
    begin
      if charat(AData, I) = '\' then SkipNext := True 
      else
      begin
        z := charat(AData, I);
        if (Z = QIndef) or (z = #0) then
        begin
          Result := Copy(AData, aPos + 1, I - APos - 1);
          //          result:=exch(result,'\','');
          APos := I + 1;
          Exit;
        end;
      end;
    end 
    else 
      skipnext := False;
    Inc(i);
  until False;
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.ParseString(var AData: string; var APos: Integer;
  RememberBreaks: Boolean = False): string;
const 
  extsym: string = '=<>;?*/';
var 
  nxt: Char;
  x1, x2, i: Integer;
begin
  Result := '';
  nxt    := SkipSpaces(AData, APos, RememberBreaks);
  if nxt = #0 then Exit;
  if (nxt = '"') or (nxt = '''') then 
  begin 
    Result := ParseQuotedString(AData, APos); 
    Exit; 
  end;
  x1  := APos;
  i   := x1;
  nxt := CharAt(AData, i);
  while ((Ord(nxt) <= 32) or (Pos(nxt, extsym) > 0)) and (nxt <> #0) do 
  begin 
    Inc(i); 
    nxt := CharAt(AData, i); 
  end;
  APos := i;
  X1 := APos;
  while (Ord(nxt) > 32) and (Pos(nxt, extsym) <= 0) do 
  begin 
    Inc(i); 
    nxt := CharAt(AData, i); 
  end;
  x2 := i - x1;
  Result := Copy(AData, x1, x2);
  APos := i;
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.ParseValue(var AData: string; var APos: Integer): Boolean;
var 
  n, v: string;
  i, x: Integer;
begin
  Result := False;
  n := parseString(AData, APos);
  if n = '' then Exit;
  if skipspaces(AData, APos) <> '=' then Exit;
  Inc(apos);
  V := parseString(AData, APos);
  fdata[n] := dequote(v);
  Result := True;
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.ParseXML(var AData: string; var APos: Integer): Boolean;
var 
  N: TYZXMLMarker;
  nxt: Integer;
  isLong: Boolean;
  inTag: Boolean;
begin
  isLong := False;
  Result := False;
  Clear;
  if WhatNext(AData, APos, nxt) <> xmlOpenTag then Exit;
  APos := nxt;
  if WhatNext(AData, APos, nxt) <> xmlIdentifier then Exit;
  Result := ParseName(AData, APos);
  if not Result then Exit;
  intag  := True;
  Result := False;
  while True do
  begin
    N := WhatNext(AData, APos, nxt, (not intag and islong and (Count > 0)));
    case N of
      xmlEOF: Exit;
      xmlCloseTagLong: 
        begin 
          Result := True; 
          if islong then APos := nxt; 
          if (Text <> '') and (Count > 0) then 
          begin 
            Text := exch(Text, #13#10#13#10, #13#10); 
          end;
 
          Exit; 
        end;
      xmlCloseTagShort: 
        begin 
          Result := (not IsLong) and intag; 
          if Result then APos := nxt; 
          Exit; 
        end;
      xmlOpenTag: 
        begin 
          if islong then Result := AddChild.ParseXML(AData, APos) 
          else 
          begin 
            Result := False; 
            Exit; 
          end; 
          if not Result then Exit; 
        end;
      xmlCloseTag: 
        begin 
          IsLong := True; 
          APos   := nxt; 
          intag  := False; 
        end;
      xmlIdentifier: 
        begin 
          if intag then parsevalue(AData, APos) 
          else 
            Text := Text + ParseString(AData, APos, True) 
        end;
      xmlUnknown: 
        begin 
          Result := True; 
          Exit; 
        end;
    end;
  end;
end;
 
//------------------------------------------------------------------------------
 
procedure TYZXMLTag.SetName(const Value: string);
begin
  FName := Value;
end;
 
//------------------------------------------------------------------------------
 
procedure TYZXMLTag.SetValue(AName: string; const Value: string);
begin
  FData[AName] := Value;
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.SkipSpaces(var AData: string; var APos: Integer;
  RememberBreaks: Boolean = False): Char;
var 
  L: Integer;
  P: Char;
begin
  L := Length(AData);
  while APos <= L do
  begin
    P := AData[APos];
    if Ord(p) > 32 then 
    begin 
      Result := p; 
      Exit; 
    end 
    else if rememberbreaks then
    begin
      if Pos(p, #13#9' ') > 0 then
        Text := Text + ' ';
    end;
    Inc(APos);
  end;
  Result := #0;
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.CharAt(var S: string; APos: Integer): Char;
begin
  Result := #0;
  if (Length(s) < APos) or (apos < 1) then Exit;
  Result := s[APos];
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.WhatNext(var AData: string; var APos: Integer;
  var ANext: Integer; RememberBreaks: Boolean = False): TYZXMLMarker;
var 
  s: string;
  C: Char;
  P: Integer;
begin
  Result := xmlEOF;
  P := APos;
  C := SkipSpaces(AData, APos);
  P := APos;
  ANext  := P;
  if C = #0 then Exit;
 
  if C = '<' then if CharAt(AData, P + 1) = '/' then
    begin
      Inc(P, 2);
      s := parsestring(AData, P);
      if (uppercase(s) = uppercase(FName)) and (SkipSpaces(AData, P) = '>') then
      begin 
        ANext := P + 1;
        Result := xmlCloseTagLong; 
        Exit; 
      end 
      else
      begin
        if TagNameExists(s) then
        begin
          Result := xmlCloseTagLong;
          ANext := APos;
          Exit;
        end;
        ANext  := P + 1;
        Result := xmlCloseTagLong;
        Exit;
      end;
    end;
 
  if C = '<' then 
  begin 
    ANext := P + 1;
    Result := xmlOpenTag; 
    Exit; 
  end;
 
  if C = '>' then 
  begin 
    ANext := P + 1;
    Result := xmlCloseTag; 
    Exit; 
  end;
  if C = '/' then if CharAt(AData, P + 1) = '>' then 
    begin 
      ANext := P + 2;
      Result := xmlCloseTagShort; 
      Exit;
    end;
  ANext := P;
  parsestring(AData, ANext);
  Result := xmlIdentifier;
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.GetValueNames: strarr;
begin
  Result := FData.Keys;
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.GenerateXML(var AData: string; ATab: string = ''): Boolean;
var 
  valDelimiter: string;
  spc: string;
  i: Integer;
  a: strarr;
begin
  spc := ATab + #9;
  if FData.Count < 5 then valDelimiter := ' ' 
  else 
    valDelimiter := #13#10 + spc;
  AData := AData + #13#10 + ATab + '<' + FName;
  a     := FData.keys;
  for i := 0 to Length(a) - 1 do
  begin
    AData := AData + valDelimiter + a[i] + ' = "' + EnQuote(values[a[i]]) + '"';
  end;
  if (Count > 0) or (Text <> '') then
  begin
    AData := AData + '>' + Text;
    for i := 0 to Count - 1 do
    begin
      Children[i].GenerateXML(AData, ATab + #9);
    end;
    AData := AData + #13#10 + ATab + '</' + FName + '>';
  end 
  else 
    AData := AData + '/>';
  Result := True;
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLTag.TagNameExists(AName: string): Boolean;
begin
  Result := AnsiUpperCase(AName) = AnsiUpperCase(Self.FName);
  if Self.FParent = nil then Exit;
  if not Result then Result := fparent.TagNameExists(AName);
end;
 
//==============================================================================
 
 
{ TYZXMLParser }
 
constructor TYZXMLParser.Create;
begin
  Header := TYZHash.Create;
  inherited Create(nil);
end;
 
//------------------------------------------------------------------------------
 
destructor TYZXMLParser.Destroy;
begin
  inherited;
  Header.Destroy;
end;
 
//------------------------------------------------------------------------------
 
procedure TYZXMLParser.BuildTreeView(ATreeView: TTreeView);
var 
  i: Integer;
begin
  //  clear;
  ATreeView.Items.Clear;
  for i := 0 to Count - 1 do _BuildTreeView(ATreeView, nil, children[i]);
end;
 
//------------------------------------------------------------------------------
 
procedure TYZXMLParser._BuildTreeView(ATreeView: TTreeView; ANode: TTreeNode;
  ATag: TYZXMLTag);
var 
  i: Integer;
  N: TTreeNode;
begin
  N := ATreeView.Items.AddChildObject(ANode, ATag.Name + ' ' + FData['ID'], Pointer(ATag));
  for i := 0 to ATag.Count - 1 do
  begin
    if ATag.children[i] <> nil then _BuildTreeView(ATreeView, N, ATag.children[i]) 
    else 
      ATreeView.Items.AddChild(N, 'nil');
  end;
  N.Expanded := True;
end;
 
//------------------------------------------------------------------------------
 
function TYZXMLParser.Parse(AData: string): Boolean;
var 
  x1, x2, X, i: Integer;
  s: string;
  tmp: TYZXMLTag;
  a: strarr;
  N: TYZXMLMarker;
begin
  X := 1;
  Self.SkipSpaces(AData, X);
  x2 := -1;
  Result := False;
  Clear;
  Header.Clear;
  x1 := Pos('<?', AData);
  if x1 >= X then
  begin
    x2 := Pos('?>', AData);
    if x2 < X then Exit;
    s := uppercase(Copy(AData, x1 + 2, 4));
    if Pos('XML ', s) <> 1 then Exit;
    s   := '<xml ' + Copy(AData, x1 + 6, x2 - x1 - 6) + '/>';
    tmp := TYZXMLTag.Create(nil);
    tmp.ParseXML(s, x);
    a := tmp.ValueNames;
    for i := 0 to Length(a) - 1 do
      Header[a[i]] := tmp.Values[a[i]];
    tmp.Destroy;
    x := x2 + 2;
  end;
  Result := True;
  repeat
    N := whatnext(AData, X, x1);
    case N of
      xmlOpenTag: Result := Result and AddChild.ParseXML(AData, X);
      xmlIdentifier: 
        begin 
          if Text <> '' then Text := Text + ' '; 
          Text := Text + parsestring(AData, X, True); 
        end;
      else 
        Parsestring(AData, X);
    end;
  until skipspaces(adata, x) = #0;
  //  if not result then ShowMessage('Error Parsing: '+inttostr(X));
end;
 
 
 
function TYZXMLParser.Generate(var AData: string): Boolean;
var 
  i: Integer;
  a: strarr;
begin
  Header['Date'] := DateTimeToStr(now);
  a := header.Keys;
 
  AData := '<?xml';
  for i := 0 to Length(a) - 1 do
    AData := AData + ' ' + a[i] + '="' + Header[a[i]] + '"';
 
  AData  := AData + '?>'#13#10 + Text;
  Result := True;
  for i := 0 to Length(children) - 1 do
  begin
    Result := Result and children[i].generatexml(AData);
  end;
end;
 
//==============================================================================
 
 
// procedures of THash class
 
 
//==============================================================================
 
{THASH CLASS}
 
 
procedure THash.Clear;
begin
  SetLength(Arr, 0);
end;
 
constructor THash.Create;
begin
  inherited;
  Clear;
end;
 
//------------------------------------------------------------------------------
 
destructor THash.Destroy;
begin
  Clear;
  inherited;
end;
 
//------------------------------------------------------------------------------
 
function THash.GetCount: Integer;
begin
  Result := Length(Arr);
end;
 
//------------------------------------------------------------------------------
 
function THash.Getempty: Boolean;
begin
  Result := Length(Arr) = 0;
end;
 
function THash.GetKeys: StrArr;
var 
  i: Integer;
begin
  SetLength(Result, Length(arr));
  for i := 0 to Length(Result) - 1 do
    Result[i] := arr[i].Key;
end;
 
//------------------------------------------------------------------------------
 
function THash.GetValue(Key: string): string;
var 
  i: Integer;
  r: Boolean;
begin
  Result := '';
  i      := 0; 
  r      := False;
  while (i < Length(Arr)) and (not r) do
  begin
    if AnsiUpperCase(arr[i].key) = AnsiUpperCase(Key) then 
    begin 
      Result := Arr[i].Value; 
      r := True;
    end;
    i := i + 1;
  end;
end;
 
//------------------------------------------------------------------------------
 
function THash.GetValues: StrArr;
var 
  i: Integer;
begin
  SetLength(Result, Length(arr));
  for i := 0 to Length(Result) - 1 do
    Result[i] := arr[i].Value;
end;
 
//------------------------------------------------------------------------------
 
procedure THash.SetValue(Key: string; const VValue: string);
var 
  i, j: Integer;
  r: Boolean;
  E: THashElementArr;
begin
  if VValue <> '' then
  begin
    i := 0; 
    r := False;
    while (i < Length(Arr)) and not r do
    begin
      if AnsiUpperCase(arr[i].key) = AnsiUpperCase(Key) then 
      begin 
        Arr[i].Value := VValue; 
        r := True;
      end;
      i := i + 1;
    end;
    if not r then 
    begin 
      SetLength(Arr, Length(arr) + 1);
      arr[Length(arr) - 1].Key   := Key;
      arr[Length(arr) - 1].Value := Vvalue; 
    end;
  end;
 
  SetLength(E, Length(Arr));
  for i := 0 to Length(arr) - 1 do E[i] := Arr[i];
  SetLength(arr, 0);
  for i := 0 to Length(E) - 1 do if (E[i].Key <> '') and (E[i].Value <> '') then
    begin
      j := Length(arr);
      setlength(arr, j + 1);
      arr[j] := E[i];
    end;
end;
 
end.