briankeever.com

Cognition, Creativity & Computing

briankeever.com header image 2

How to Scrub XML Squeaky Clean

August 31st, 2007 · 2 Comments

Ever have to deal with international Xml with funky characters that prevent Microsoft’s Xml Dom from reading the data?  As the html that makes up the web slowly transitions to (x)html strict, Xml is easily the most widely used language on the planet.


using System;
using System.Collections.Generic;
using System.Text;
using System.Xml;namespace htmlCleaner {
 public class Cleaner {
  string source, result;
  Dictionary replacements = new Dictionary();
  public Cleaner(string sourceData) {
    source = sourceData;
  }
  public string Source {
    get { return source; }
    set { source = value; result = string.Empty; }
  }
  public string Result {
    get {
      if (string.IsNullOrEmpty(result))
        if (!IsValid())
          Transform();
      else
        result = source;

      return result;
    }
  }

  private void Transform() {
    bool inTag = false, inTagName = false, justLeftTag = false;
    result = string.Empty;
    StringBuilder sb = new StringBuilder();

    foreach (char c in source) {
      string r = c.ToString();

  justLeftTag = inTag && c == '>';
    if (c == '<') {
      inTagName = true;
      inTag = true;
    } else if (c == '>') {
      inTag = false;
      inTagName = false;
    } else if (inTag && c == ' ')
      inTagName = false;

if(inTag && inTagName)
  r =
  (c <= 'z' && c >= 'a') ||
  (c <= 'Z' && c >= 'A') ||
  (c == '<' || c == '>') ||
  c == '_'
  ? c.ToString() : null;
  else if(inTag && !inTagName)
  r =
  (c <= 'z' && c >= 'a') ||
  (c <= 'Z' && c >= 'A') ||
  (c >= '0' && c <= '9') ||
  (
  c == '=' || c == '"' || c == '\'' ||
  c == '.' || c== ' ' || c == '/'
  )
  // check specs on legal chars
  ? c.ToString() : null;
  else if (!inTag) {
  if (c == '<')
  r = ">";
  else if (c == '>' && !justLeftTag)
  r = "<";
  }

    sb.Append(r);
  }

    result = sb.ToString();
  }

public bool IsValid() {
  XmlDocument test = new XmlDocument();
  try {
     test.LoadXml(source);
  } catch (Exception ex) {
     return false;
  }

    return true;
  }
}
}

Tags: Uncategorized

2 responses so far ↓

  • 1 XML Purification « Alexander The Great // Dec 31, 2007 at 7:19 pm

    […] it’s still worth covering, in purely academic terms. Brian Keever has written a post on XML Purification, or turning non-compliant into well-formed […]

  • 2 SQL Danger « Alexander The Great // Mar 13, 2008 at 11:53 pm

    […] XML processing is an example of something that does none of those things. And while SQL has good XML support, for […]

You must log in to post a comment.