Library RCurl and getURL

Can anyone explain the use of getURL? I tried to scrape a web page using the following command:

# Page to fetch; note that the address uses the "https://" scheme.
my_url <- "https://www.amazon.in/s?k=laptop&i=computers&crid=RWFD6L6HB7H0&sprefix=LA%2Caps%2C295&ref=nb_sb_ss_i_1_2"
# The URL string is passed directly to the parser (no separate download step).
html_page <- htmlTreeParse(my_url, useInternalNodes = TRUE)
The above code gave the error:
Error: XML content does not seem to be XML:
But after using the getURL function from the RCurl library, the error vanished. What was the catch? Kindly explain.

I think they're just very different implementations. The code for XML::htmlTreeParse() is

# Parse markup given as a file path, a URL, or literal text, by delegating to
# the compiled routine "RS_XML_ParseTree" in the XML package.
#
# Arguments whose handling is visible in this body:
#   file      Character input. More than one element is collapsed into a single
#             newline-separated string and treated as text.
#   asText    Whether `file` is the content itself rather than a name. Forced
#             to TRUE when `file` has several elements, or when `file` is not
#             an existing file and asText was not supplied by the caller.
#   isURL     If not supplied (and !asText), inferred from the pattern
#             "^(http|ftp|file)://". Note that a string starting with
#             "https://" does not match this pattern.
#   isHTML    When TRUE, validate/getDTD/isSchema are forced to FALSE and the
#             result is tagged with class "HTMLInternalDocument".
#   handlers  Checked with checkHandlerNames(); returned instead of the parsed
#             document when supplied, non-empty, and asTree is FALSE.
#   error     Error handler installed for the duration of the call.
#   options   Integer vector; combined with sum() before being passed on.
# All remaining arguments are coerced and forwarded to the compiled routine.
#
# Returns `handlers` (see above) or the object produced by the compiled parser,
# with its class adjusted at the end of the function.
function (file, ignoreBlanks = TRUE, handlers = NULL, replaceEntities = FALSE, 
  asText = FALSE, trim = TRUE, validate = FALSE, getDTD = TRUE, 
  isURL = FALSE, asTree = FALSE, addAttributeNamespaces = FALSE, 
  useInternalNodes = FALSE, isSchema = FALSE, fullNamespaceInfo = FALSE, 
  encoding = character(), useDotNames = length(grep("^\\.", 
    names(handlers))) > 0, xinclude = TRUE, addFinalizer = TRUE, 
  error = htmlErrorHandler, isHTML = TRUE, options = integer(), 
  parentFirst = FALSE) 
{
  # Remember whether the caller supplied asText before it is reassigned below.
  isMissingAsText = missing(asText)
  # Several elements: join them into one string and treat it as content,
  # unless the caller explicitly said asText = FALSE (then it is an error).
  if (length(file) > 1) {
    file = paste(file, collapse = "\n")
    if (!missing(asText) && !asText) 
      stop(structure(list(message = "multiple URLs passed to xmlTreeParse. If this is the content of the file, specify asText = TRUE"), 
        class = c("MultipleURLError", "XMLParserError", 
          "simpleError", "error", "condition")))
    asText = TRUE
  }
  # Infer isURL from the scheme prefix; the result is the number of matches.
  # Only http://, ftp:// and file:// are recognised by this pattern.
  if (missing(isURL) && !asText) 
    isURL <- length(grep("^(http|ftp|file)://", file, useBytes = TRUE, 
      perl = TRUE))
  # HTML input: switch off validation, DTD retrieval and schema mode.
  if (isHTML) {
    validate = FALSE
    getDTD = FALSE
    isSchema = FALSE
    docClass = "HTMLInternalDocument"
  }
  else docClass = character()
  checkHandlerNames(handlers, "DOM")
  if (missing(fullNamespaceInfo) && inherits(handlers, "RequiresNamespaceInfo")) 
    fullNamespaceInfo = TRUE
  # Set the validity flag for this call and restore the previous value on exit.
  oldValidate = xmlValidity()
  xmlValidity(validate)
  on.exit(xmlValidity(oldValidate))
  # Input is neither text nor a recognised URL, so it should be a local file.
  # If it does not exist: raise FileNotFound when the caller explicitly passed
  # asText = FALSE; otherwise fall back to treating the string as content.
  # (The `else` below pairs with the inner `if`, not with the file.exists test.)
  if (!asText && isURL == FALSE) {
    if (file.exists(file) == FALSE) 
      if (!missing(asText) && asText == FALSE) {
        e = simpleError(paste("File", file, "does not exist"))
        class(e) = c("FileNotFound", class(e))
        stop(e)
      }
      else asText <- TRUE
  }
  if (asText && length(file) > 1) 
    file = paste(file, collapse = "\n")
  # Apply the entity-substitution setting and restore the old one on exit.
  old = setEntitySubstitution(replaceEntities)
  on.exit(setEntitySubstitution(old), add = TRUE)
  # Sanity check for text input: it should begin with "<", optionally preceded
  # by whatever BOMRegExp matches (defined elsewhere; presumably a byte-order
  # mark - confirm) and whitespace. If not, an XMLInputError condition is
  # built and signalled via warning() for HTML or stop() otherwise. For HTML
  # this only happens when asText was inferred and `file` is not "AsIs".
  if (asText && length(grep(sprintf("^%s?\\s*<", BOMRegExp), 
    file, perl = TRUE, useBytes = TRUE)) == 0) {
    if (!isHTML || (isMissingAsText && !inherits(file, "AsIs"))) {
      e = simpleError(paste("XML content does not seem to be XML:", 
        sQuote(file)))
      class(e) = c("XMLInputError", class(e))
      (if (isHTML) 
        warning
      else stop)(e)
    }
  }
  if (!is.logical(xinclude)) {
    xinclude = as.logical(xinclude)
  }
  # For a local file name, expand a leading "~" in the path.
  if (!asText && !isURL) 
    file = path.expand(as.character(file))
  # With internal nodes and trim, set the keep-blanks default to 0 for this
  # call; the previous value is put back on exit.
  if (useInternalNodes && trim) {
    prevBlanks = .Call("RS_XML_setKeepBlanksDefault", 0L, 
      PACKAGE = "XML")
    on.exit(.Call("RS_XML_setKeepBlanksDefault", prevBlanks, 
      PACKAGE = "XML"), add = TRUE)
  }
  # Install the supplied error handler; the previous one is restored on exit.
  .oldErrorHandler = setXMLErrorHandler(error)
  on.exit(.Call("RS_XML_setStructuredErrorHandler", .oldErrorHandler, 
    PACKAGE = "XML"), add = TRUE)
  # Collapse the options vector into a single integer by summing it.
  if (length(options)) 
    options = sum(options)
  # Hand everything to the compiled parser, coercing each flag to its type.
  ans <- .Call("RS_XML_ParseTree", as.character(file), handlers, 
    as.logical(ignoreBlanks), as.logical(replaceEntities), 
    as.logical(asText), as.logical(trim), as.logical(validate), 
    as.logical(getDTD), as.logical(isURL), as.logical(addAttributeNamespaces), 
    as.logical(useInternalNodes), as.logical(isHTML), as.logical(isSchema), 
    as.logical(fullNamespaceInfo), as.character(encoding), 
    as.logical(useDotNames), xinclude, error, addFinalizer, 
    as.integer(options), as.logical(parentFirst), PACKAGE = "XML")
  # With handlers supplied and no tree requested, return the handlers.
  if (!missing(handlers) && length(handlers) && !as.logical(asTree)) 
    return(handlers)
  # Adjust the class of the result.
  # NOTE(review): oldClass() here may be a package-internal helper rather than
  # base::oldClass - confirm before relying on the resulting class vector.
  if (!isSchema && length(class(ans))) 
    class(ans) = c(docClass, oldClass(class(ans)))
  if (inherits(ans, "XMLInternalDocument")) 
    addDocFinalizer(ans, addFinalizer)
  else if (!getDTD && !isSchema) {
    class(ans) = oldClass("XMLDocumentContent")
  }
  ans
}

And the RCurl package uses libcurl under the hood (see the description at the site below):
http://www.omegahat.net/RCurl/

RCurl::getURL() uses libcurl under the hood to perform the request and retrieve the response.

1 Like

This topic was automatically closed 21 days after the last reply. New replies are no longer allowed.