// HtmlNode.java
package us.codecraft.webmagic.selector;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;
import java.util.ListIterator;
/**
 * A selectable backed by a list of jsoup {@link Element}s.
 * <p>
 * Element-based selectors (xpath, css, links) are evaluated directly against the
 * elements; any other selector is evaluated against the elements' string form.
 *
 * @author code4crafer@gmail.com
 */
public class HtmlNode extends AbstractSelectable {

    /**
     * Elements wrapped by this node. Never {@code null}. Entries may be replaced in
     * place by wrapper documents, see {@link #checkElementAndConvert(ListIterator)}.
     */
    private final List<Element> elements;

    /**
     * Creates a node over the given elements.
     * <p>
     * The list is copied: selection rewrites entries of the internal list, and that
     * must neither leak into the caller's list nor fail when the caller passes an
     * unmodifiable list.
     *
     * @param elements elements to wrap; {@code null} is treated as an empty list
     */
    public HtmlNode(List<Element> elements) {
        if (elements == null) {
            this.elements = new ArrayList<Element>();
        } else {
            this.elements = new ArrayList<Element>(elements);
        }
    }

    /**
     * Creates a node without elements of its own. All internal access goes through
     * {@link #getElements()}, so a subclass can override that method to supply them.
     */
    public HtmlNode() {
        // An empty list instead of null keeps nodes(), getSourceTexts() and
        // selectElements() from throwing when getElements() is not overridden.
        this.elements = new ArrayList<Element>();
    }

    /**
     * Returns the elements every operation of this class works on.
     * <p>
     * If an override returns elements that are not {@link Document}s, the returned
     * list must support {@link ListIterator#set(Object)}.
     *
     * @return the backing elements
     */
    protected List<Element> getElements() {
        return elements;
    }

    /**
     * Applies the smart-content selector to the source text of each element.
     *
     * @return the selection result
     */
    public Selectable smartContent() {
        SmartContentSelector smartContentSelector = Selectors.smartContent();
        return select(smartContentSelector, getSourceTexts());
    }

    /**
     * Selects with a {@link LinksSelector}.
     *
     * @return the selection result
     */
    @Override
    public Selectable links() {
        return selectElements(new LinksSelector());
    }

    /**
     * Selects with an xpath expression.
     *
     * @param xpath xpath expression
     * @return the selection result
     */
    @Override
    public Selectable xpath(String xpath) {
        XpathSelector xpathSelector = Selectors.xpath(xpath);
        return selectElements(xpathSelector);
    }

    /**
     * Selects with an arbitrary selector.
     * <p>
     * A {@link BaseElementSelector} is evaluated against the elements themselves;
     * any other selector is evaluated against their source text.
     *
     * @param selector selector to apply
     * @return the selection result
     */
    @Override
    public Selectable selectList(Selector selector) {
        if (selector instanceof BaseElementSelector) {
            return selectElements((BaseElementSelector) selector);
        }
        return selectList(selector, getSourceTexts());
    }

    /**
     * Delegates to {@link #selectList(Selector)}.
     *
     * @param selector selector to apply
     * @return the selection result
     */
    @Override
    public Selectable select(Selector selector) {
        return selectList(selector);
    }

    /**
     * Runs an element selector over every element.
     * <p>
     * When the selector reports no attribute, the matched elements are returned in a
     * new {@code HtmlNode}. Otherwise the selected strings are returned as
     * {@link PlainText}.
     *
     * @param elementSelector elementSelector
     * @return result
     */
    protected Selectable selectElements(BaseElementSelector elementSelector) {
        // A ListIterator is required: checkElementAndConvert may replace the current entry.
        ListIterator<Element> elementIterator = getElements().listIterator();
        if (!elementSelector.hasAttribute()) {
            List<Element> resultElements = new ArrayList<Element>();
            while (elementIterator.hasNext()) {
                Element element = checkElementAndConvert(elementIterator);
                resultElements.addAll(elementSelector.selectElements(element));
            }
            return new HtmlNode(resultElements);
        }
        // has attribute, consider as plaintext
        List<String> resultStrings = new ArrayList<String>();
        while (elementIterator.hasNext()) {
            Element element = checkElementAndConvert(elementIterator);
            resultStrings.addAll(elementSelector.selectList(element));
        }
        return new PlainText(resultStrings);
    }

    /**
     * Only document can be select
     * See: https://github.com/code4craft/webmagic/issues/113
     * <p>
     * Advances the iterator and returns a {@link Document} for the element it yields.
     * A {@code Document} is returned as is. Any other element is cloned into a new
     * document, which then replaces the element in the underlying list so that later
     * selections reuse it.
     *
     * @param elementIterator elementIterator
     * @return element element
     */
    private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
        Element element = elementIterator.next();
        if (element instanceof Document) {
            return element;
        }
        // ownerDocument() is null for an element that is not attached to a document;
        // fall back to the element's own base URI instead of dereferencing null.
        Document owner = element.ownerDocument();
        String baseUri = owner != null ? owner.baseUri() : element.baseUri();
        Document root = new Document(baseUri);
        root.appendChild(element.clone());
        elementIterator.set(root);
        return root;
    }

    /**
     * Selects with a css selector.
     *
     * @param selector css selector
     * @return the selection result
     */
    @Override
    public Selectable $(String selector) {
        CssSelector cssSelector = Selectors.$(selector);
        return selectElements(cssSelector);
    }

    /**
     * Selects with a css selector and an attribute name.
     *
     * @param selector css selector
     * @param attrName attribute name passed to the css selector
     * @return the selection result
     */
    @Override
    public Selectable $(String selector, String attrName) {
        CssSelector cssSelector = Selectors.$(selector, attrName);
        return selectElements(cssSelector);
    }

    /**
     * Splits this node into one {@code HtmlNode} per element.
     *
     * @return one single-element node for each element, in order
     */
    @Override
    public List<Selectable> nodes() {
        List<Selectable> selectables = new ArrayList<Selectable>();
        for (Element element : getElements()) {
            List<Element> childElements = new ArrayList<Element>(1);
            childElements.add(element);
            selectables.add(new HtmlNode(childElements));
        }
        return selectables;
    }

    /**
     * Returns the string form ({@code toString()}) of every element.
     *
     * @return source texts, one per element, in order
     */
    @Override
    protected List<String> getSourceTexts() {
        List<Element> sourceElements = getElements();
        List<String> sourceTexts = new ArrayList<String>(sourceElements.size());
        for (Element element : sourceElements) {
            sourceTexts.add(element.toString());
        }
        return sourceTexts;
    }
}