My iOS sharing extension #Shaarli💫 (source code + tests) has to communicate with HTML backends: scraping data from HTML forms and sending HTTP POSTs. I want to avoid parsing HTML myself, and luckily libxml2's HTMLParser does that and is available on iOS. Giants, shoulders, you know.
I also want to avoid holding the complete document in memory, let alone traversing a DOM, when I can grab the few things I need from a SAX parser while they fly by. I used to get the form data with C/Objective-C and XPath, OMG. Having as little hand-written, non-Swift-managed code as possible for parsing arbitrary content from the internet seems desirable.
So libxml2
for html parsing, sax (streaming or push parser) for slim
footprint and Swift for memory safety and ‘modern’ idioms
(Closures!).
Let’s add libxml2
sax parsing mojo to the Xcode project, following The Red
Queen Coder:
- in the build settings look for “Header Search Paths” and add `$(SDKROOT)/usr/include/libxml2`,
- add `-lxml2` to “Other Linker Flags”,
- set “Objective-C Bridging Header” to a `foo/Bridging.h`, see below.
The actual parsing finally turned out simpler than expected.
Especially the closures wiring the sax callbacks in line 52 look nice, don’t they?
Happy parsing!
Addendum
Bridging.h
//
// Bridging header to access libxml2 html parsing from Swift.
// http://mro.name/ShaarliOS
//
// First adjust some settings as described by http://redqueencoder.com/wrapping-libxml2-for-swift/
//
// * add to Xcode build settings “Header Search Paths”:
// $(SDKROOT)/usr/include/libxml2
// * add to Xcode build settings "Other Linker Flags."
// -lxml2
//
// Also: https://github.com/SonoPlot/Swift-libxml/blob/master/LibXMLWrapperExample/LibXMLWrapperExample/Bridging-Header.h
#import <libxml/HTMLparser.h>
#import <libxml/xmlerror.h>
HtmlFormParser.swift
//
// HtmlFormParser.swift
// http://mro.name/ShaarliOS
//
// Created by Marcus Rohrmoser on 09.06.19.
// Copyright © 2019 Marcus Rohrmoser mobile Software. All rights reserved.
//
import Foundation
typealias HtmlFormDict = [String:String]
/// Extracts all HTML forms from `body` using libxml2's graceful HTML parsing.
///
/// - Parameters:
///   - body: Raw bytes of the HTML document; `nil` yields an empty result.
///   - encoding: Currently unused; it is not forwarded to the parser.
/// - Returns: The fields of every form found, keyed by form name.
func findHtmlForms(_ body:Data?, _ encoding:String?) -> [String:HtmlFormDict] {
    let parser = HtmlFormParser()
    let forms = parser.parse(body)
    return forms
}
/// Turns a flat, nil-terminated list of name, value, name, value, … into a dictionary.
///
/// Abbreviated (html5) attributes, i.e. a name whose value is `nil`, are
/// expanded so that the name maps to itself.
///
/// - Parameter atts: Accessor returning the string at a given index, or `nil`
///   where the list holds no string.
/// - Returns: The attributes as name → value pairs; a later duplicate name
///   overwrites an earlier one.
internal func atts2dict(_ atts: (Int) -> String?) -> HtmlFormDict {
    var result: HtmlFormDict = [:]
    var cursor = 0
    while true {
        // A missing name marks the end of the list.
        guard let key = atts(cursor) else { break }
        let value: String = atts(cursor + 1) ?? key
        result[key] = value
        cursor += 2
    }
    return result
}
// https://github.com/apple/swift-corelibs-foundation/blob/master/Foundation/XMLParser.swift#L33
/// Converts a nul-terminated libxml2 string into a Swift `String`.
///
/// Decoding is strict UTF-8: invalid code units are not repaired.
///
/// - Parameter bytes: Pointer to the C string, or `nil`.
/// - Returns: The decoded string, or `nil` when `bytes` is `nil` or is not
///   valid UTF-8.
private func decode(_ bytes:UnsafePointer<xmlChar>?) -> String? {
    return bytes.flatMap { cString in
        String.decodeCString(cString, as: UTF8.self, repairingInvalidCodeUnits: false)?.result
    }
}
/// Recovers the `HtmlFormParser` instance from the opaque pointer handed to a
/// SAX callback.
///
/// The reference is taken unretained, so the retain count is left untouched.
/// Traps when `ptr` is `nil`.
private func me(_ ptr : UnsafeRawPointer?) -> HtmlFormParser {
    let unmanaged = Unmanaged<HtmlFormParser>.fromOpaque(ptr!)
    return unmanaged.takeUnretainedValue()
}
/// SAX-based collector for the fields of HTML `<form>` elements.
///
/// Feeds the document to libxml2's HTML push parser and records, per form,
/// the values of its `<input>` and `<textarea>` elements. Forms and fields are
/// keyed by their `name` attribute, falling back to `id`, then to `""`.
private class HtmlFormParser {
    /// Completed forms, keyed by form name.
    private var forms : [String:HtmlFormDict] = [:]
    /// Fields collected since the most recent `<form>` start tag.
    private var form : HtmlFormDict = [:]
    /// Name of the form currently being collected.
    private var formName = ""
    /// Name of the currently open `<textarea>`; empty while no text is collected.
    private var textName = ""
    /// Text accumulated for the currently open `<textarea>`.
    private var text = ""

    /// Parses an HTML document and returns the forms found in it.
    ///
    /// - Parameter data: Raw bytes of the document; `nil` yields an empty result.
    /// - Returns: Form name → (field name → value).
    func parse(_ data:Data?) -> [String:HtmlFormDict] {
        guard let data = data else { return [:] }

        // Start from a clean slate so a reused instance never reports forms
        // or fields of a previously parsed document.
        forms = [:]
        form = [:]
        formName = ""
        textName = ""
        text = ""

        var sax = htmlSAXHandler()
        sax.initialized = XML_SAX2_MAGIC
        // The callbacks receive this instance as their first argument (the
        // user-data pointer given to the parser below) and `me` turns it back
        // into a `HtmlFormParser`.
        sax.startElement = { me($0).startElement(name:$1, atts:$2) }
        sax.endElement = { me($0).endElement(name:$1) }
        sax.characters = { me($0).charactersFound(ch:$1, len:$2) }
        // handler.error = errorEncounteredSAX

        // https://curl.haxx.se/libcurl/c/htmltitle.html
        // http://xmlsoft.org/html/libxml-HTMLparser.html#htmlParseChunk
        // https://stackoverflow.com/questions/41140050/parsing-large-xml-from-server-while-downloading-with-libxml2-in-swift-3
        // https://github.com/apple/swift-corelibs-foundation/blob/master/Foundation/XMLParser.swift#L524
        // http://redqueencoder.com/wrapping-libxml2-for-swift/ or https://github.com/SonoPlot/Swift-libxml
        let ctxt = htmlCreatePushParserCtxt(&sax, Unmanaged.passUnretained(self).toOpaque(), "", 0, "", XML_CHAR_ENCODING_NONE)
        defer { htmlFreeParserCtxt(ctxt) }

        // Feed the whole document as one chunk, then signal the end of input
        // with an empty, terminating chunk.
        let _ = data.withUnsafeBytes { htmlParseChunk(ctxt, $0, Int32(data.count), 0) }
        htmlParseChunk(ctxt, "", 0, 1)
        return forms
    }

    /// SAX callback for a start tag; only `form`, `input` and `textarea` are handled.
    ///
    /// - Parameters:
    ///   - name: Element name as a C string.
    ///   - atts: nil-terminated name/value list, or `nil` when the element
    ///     carries no attributes.
    private func startElement(name: UnsafePointer<xmlChar>? , atts:UnsafePointer<UnsafePointer<xmlChar>?>?) {
        // https://github.com/MaddTheSane/chmox/blob/3263ddf09276f6a47961cc4b87762f58b88772d0/CHMTableOfContents.swift#L75
        guard let nam_ = UnsafeRawPointer(name)?.assumingMemoryBound(to: Int8.self) else { return }
        // Cheap C-level filter before paying for any String decoding.
        if 0 != strcmp("form", nam_) && 0 != strcmp("input", nam_) && 0 != strcmp("textarea", nam_) {
            return
        }
        guard let elm = decode(name) else { return }

        // libxml2 hands over NULL (not an empty list) for an element without
        // attributes. Such an element must still be processed, otherwise a
        // bare <form> or <textarea> would keep the state of its predecessor.
        let att: HtmlFormDict
        if let atts = atts {
            att = atts2dict({ decode(atts[$0]) })
        } else {
            att = [:]
        }
        let nam = att["name"] ?? att["id"] ?? ""

        switch elm {
        case "form":
            formName = nam
            form = [:]
        case "textarea":
            textName = nam
            text = ""
        case "input":
            // Checkboxes are recorded by their `checked` attribute (dropped
            // when absent or "off"); every other input by its `value`.
            form[nam] = "checkbox" == att["type"]
                ? ("off" == att["checked"] ? nil : att["checked"])
                : att["value"]
        default:
            break
        }
    }

    /// SAX callback for an end tag; stores the finished form or textarea.
    ///
    /// - Parameter name: Element name as a C string.
    private func endElement(name:UnsafePointer<xmlChar>?) {
        // https://github.com/MaddTheSane/chmox/blob/3263ddf09276f6a47961cc4b87762f58b88772d0/CHMTableOfContents.swift#L75
        guard let nam_ = UnsafeRawPointer(name)?.assumingMemoryBound(to: Int8.self) else { return }
        if 0 != strcmp("form", nam_) && 0 != strcmp("input", nam_) && 0 != strcmp("textarea", nam_) {
            return
        }
        let elm = decode(name)
        switch elm {
        case "form":
            forms[formName] = form
            formName = ""
        case "textarea":
            form[textName] = text
            textName = ""
        default:
            break
        }
    }

    /// SAX callback for character data; appends it to the open `<textarea>`.
    ///
    /// - Parameters:
    ///   - ch: Pointer to the bytes (not nul-terminated).
    ///   - len: Number of bytes at `ch`.
    private func charactersFound(ch: UnsafePointer<xmlChar>?, len: CInt) {
        // Text is only of interest inside a named textarea. A nil pointer or
        // a non-positive length contributes nothing and must not trap.
        guard !textName.isEmpty, let ch = ch, len > 0 else { return }
        let d = Data(bytes: ch, count:Int(len))
        let s = String(data: d, encoding: .utf8) ?? "<utf8 decoding issue>"
        text.append(s)
    }
}