2016-07-01 3 views
1

У меня есть следующий XML-файл, из которого я хотел бы извлечь данные с помощью R. Обычно я использую функцию read_xml из пакета xml2 в сочетании с оператором %>%. Но по какой-то причине это не работает: XML даже не считывается.
Извлечение данных из XML с помощью R

# Parse the i-th entry of doclist and select its "page" nodes; printing the
# result shows that the selection comes back empty.
invoices <- read_xml(doclist[i]) %>% xml_nodes("page") 
invoices 
{xml_nodeset (0)} 

Мне нужно извлечь только текст, идущий сразу после открывающего тега дочерних элементов <variantText>, и сохранить его в data frame. Таким образом, в этом примере это:

Klantbetaalnummer
Contactgegevens

<?xml version="1.0" encoding="UTF-8" standalone="yes"?> 
<document xmlns="http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml" version="1.0" producer="FineReader 10.0" pagesCount="2" languages="" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml"> 
    <page width="2479" height="3508" resolution="300"> 
     <block blockType="Text" blockName="" l="292" t="108" r="590" b="194"><region><rect l="292" t="108" r="590" b="194"/></region> 
      <text> 
       <par align="Justified" lineSpacing="1200"> 
        <line baseline="138" l="298" t="114" r="584" b="138"><formatting lang="EnglishUnitedStates" ff="Arial" fs="8."> 
          <wordRecVariants> 
           <wordRecVariant wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" wordPenalty="0" meanStrokeWidth="31"><variantText>Klantbetaalnummer<charParams l="0" t="0" r="0" b="0">K</charParams><charParams l="0" t="0" r="0" b="0">l</charParams><charParams l="0" t="0" r="0" b="0">a</charParams><charParams l="0" t="0" r="0" b="0">n</charParams><charParams l="0" t="0" r="0" b="0">t</charParams><charParams l="0" t="0" r="0" b="0">b</charParams><charParams l="0" t="0" r="0" b="0">e</charParams><charParams l="0" t="0" r="0" b="0">t</charParams><charParams l="0" t="0" r="0" b="0">a</charParams><charParams l="0" t="0" r="0" b="0">a</charParams><charParams l="0" t="0" r="0" b="0">l</charParams><charParams l="0" t="0" r="0" b="0">n</charParams><charParams l="0" t="0" r="0" b="0">u</charParams><charParams l="0" t="0" r="0" b="0">m</charParams><charParams l="0" t="0" r="0" b="0">m</charParams><charParams l="0" t="0" r="0" b="0">e</charParams><charParams l="0" t="0" r="0" b="0">r</charParams> 
            </variantText> 
           </wordRecVariant> 
          </wordRecVariants> 
          <charParams l="298" t="114" r="318" b="138" wordStart="1" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="6" wordPenalty="0" meanStrokeWidth="31">K</charParams> 
          <charParams l="319" t="114" r="322" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="255" wordPenalty="0" meanStrokeWidth="31">l</charParams> 
          <charParams l="326" t="120" r="341" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="16" serifProbability="0" wordPenalty="0" meanStrokeWidth="31">a</charParams> 
          <charParams l="345" t="120" r="359" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="0" wordPenalty="0" meanStrokeWidth="31">n</charParams> 
          <charParams l="362" t="114" r="370" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="28" wordPenalty="0" meanStrokeWidth="31">t</charParams> 
          <charParams l="373" t="114" r="388" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="0" wordPenalty="0" meanStrokeWidth="31">b</charParams> 
          <charParams l="391" t="120" r="406" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="40" wordPenalty="0" meanStrokeWidth="31">e</charParams> 
          <charParams l="408" t="114" r="416" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="28" wordPenalty="0" meanStrokeWidth="31">t</charParams> 
          <charParams l="419" t="120" r="434" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="16" serifProbability="0" wordPenalty="0" meanStrokeWidth="31">a</charParams> 
          <charParams l="437" t="120" r="452" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="16" serifProbability="0" wordPenalty="0" meanStrokeWidth="31">a</charParams> 
          <charParams l="457" t="114" r="460" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="255" wordPenalty="0" meanStrokeWidth="31">l</charParams> 
          <charParams l="464" t="120" r="478" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="0" wordPenalty="0" meanStrokeWidth="31">n</charParams> 
          <charParams l="483" t="120" r="497" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="29" serifProbability="0" wordPenalty="0" meanStrokeWidth="31">u</charParams> 
          <charParams l="501" t="120" r="524" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="3" wordPenalty="0" meanStrokeWidth="31">m</charParams> 
          <charParams l="529" t="120" r="552" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="3" wordPenalty="0" meanStrokeWidth="31">m</charParams> 
          <charParams l="556" t="120" r="571" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="40" wordPenalty="0" meanStrokeWidth="31">e</charParams> 
          <charParams l="575" t="120" r="584" b="138" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="16" serifProbability="4" wordPenalty="0" meanStrokeWidth="31">r</charParams></formatting><formatting lang="EnglishUnitedStates" ff="Times New Roman" fs="10."></formatting></line> 
        <line baseline="188" l="298" t="164" r="441" b="188"><formatting lang="EnglishUnitedStates" ff="Arial" fs="8." bold="1"> 
          <wordRecVariants> 
           <wordRecVariant wordFromDictionary="0" wordNormal="0" wordNumeric="1" wordIdentifier="0" wordPenalty="0" meanStrokeWidth="50"><variantText>10450320<charParams l="0" t="0" r="0" b="0">1</charParams><charParams l="0" t="0" r="0" b="0">0</charParams><charParams l="0" t="0" r="0" b="0">4</charParams><charParams l="0" t="0" r="0" b="0">5</charParams><charParams l="0" t="0" r="0" b="0">0</charParams><charParams l="0" t="0" r="0" b="0">3</charParams><charParams l="0" t="0" r="0" b="0">2</charParams><charParams l="0" t="0" r="0" b="0">0</charParams> 
            </variantText> 
           </wordRecVariant> 
          </wordRecVariants> 
          <charParams l="298" t="164" r="309" b="188" wordStart="1" wordFromDictionary="0" wordNormal="0" wordNumeric="1" wordIdentifier="0" charConfidence="46" serifProbability="67" wordPenalty="0" meanStrokeWidth="50">1</charParams> 
          <charParams l="315" t="164" r="330" b="188" wordStart="0" wordFromDictionary="0" wordNormal="0" wordNumeric="1" wordIdentifier="0" charConfidence="100" serifProbability="255" wordPenalty="0" meanStrokeWidth="50">0</charParams> 
          <charParams l="332" t="164" r="349" b="188" wordStart="0" wordFromDictionary="0" wordNormal="0" wordNumeric="1" wordIdentifier="0" charConfidence="100" serifProbability="255" wordPenalty="0" meanStrokeWidth="50">4</charParams> 
          <charParams l="352" t="164" r="367" b="188" wordStart="0" wordFromDictionary="0" wordNormal="0" wordNumeric="1" wordIdentifier="0" charConfidence="100" serifProbability="44" wordPenalty="0" meanStrokeWidth="50">5</charParams> 
          <charParams l="370" t="164" r="385" b="188" wordStart="0" wordFromDictionary="0" wordNormal="0" wordNumeric="1" wordIdentifier="0" charConfidence="100" serifProbability="255" wordPenalty="0" meanStrokeWidth="50">0</charParams> 
          <charParams l="389" t="164" r="404" b="188" wordStart="0" wordFromDictionary="0" wordNormal="0" wordNumeric="1" wordIdentifier="0" charConfidence="89" serifProbability="255" wordPenalty="0" meanStrokeWidth="50">3</charParams> 
          <charParams l="407" t="164" r="422" b="188" wordStart="0" wordFromDictionary="0" wordNormal="0" wordNumeric="1" wordIdentifier="0" charConfidence="100" serifProbability="255" wordPenalty="0" meanStrokeWidth="50">2</charParams> 
          <charParams l="426" t="164" r="441" b="188" wordStart="0" wordFromDictionary="0" wordNormal="0" wordNumeric="1" wordIdentifier="0" charConfidence="100" serifProbability="255" wordPenalty="0" meanStrokeWidth="50">0</charParams></formatting></line></par> 
      </text> 
     </block> 
     <block blockType="Text" blockName="" l="1826" t="383" r="2113" b="426"><region><rect l="1826" t="383" r="2113" b="426"/></region> 
      <text> 
       <par align="Justified"> 
        <line baseline="413" l="1832" t="389" r="2107" b="420"><formatting lang="EnglishUnitedStates" ff="Arial" fs="8." bold="1"> 
          <wordRecVariants> 
           <wordRecVariant wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" wordPenalty="0" meanStrokeWidth="50"><variantText>Contactgegevens<charParams l="0" t="0" r="0" b="0">C</charParams><charParams l="0" t="0" r="0" b="0">o</charParams><charParams l="0" t="0" r="0" b="0">n</charParams><charParams l="0" t="0" r="0" b="0">t</charParams><charParams l="0" t="0" r="0" b="0">a</charParams><charParams l="0" t="0" r="0" b="0">c</charParams><charParams l="0" t="0" r="0" b="0">t</charParams><charParams l="0" t="0" r="0" b="0">g</charParams><charParams l="0" t="0" r="0" b="0">e</charParams><charParams l="0" t="0" r="0" b="0">g</charParams><charParams l="0" t="0" r="0" b="0">e</charParams><charParams l="0" t="0" r="0" b="0">v</charParams><charParams l="0" t="0" r="0" b="0">e</charParams><charParams l="0" t="0" r="0" b="0">n</charParams><charParams l="0" t="0" r="0" b="0">s</charParams> 
            </variantText> 
           </wordRecVariant> 
          </wordRecVariants> 
          <charParams l="1832" t="389" r="1853" b="413" wordStart="1" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="51" wordPenalty="0" meanStrokeWidth="50">C</charParams> 
          <charParams l="1856" t="395" r="1874" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="255" wordPenalty="0" meanStrokeWidth="50">o</charParams> 
          <charParams l="1877" t="395" r="1893" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="0" wordPenalty="0" meanStrokeWidth="50">n</charParams> 
          <charParams l="1895" t="389" r="1905" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="33" serifProbability="44" wordPenalty="0" meanStrokeWidth="50">t</charParams> 
          <charParams l="1908" t="395" r="1924" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="0" wordPenalty="0" meanStrokeWidth="50">a</charParams> 
          <charParams l="1926" t="395" r="1942" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="51" wordPenalty="0" meanStrokeWidth="50">c</charParams> 
          <charParams l="1944" t="389" r="1954" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="33" serifProbability="44" wordPenalty="0" meanStrokeWidth="50">t</charParams> 
          <charParams l="1956" t="395" r="1973" b="420" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="12" wordPenalty="0" meanStrokeWidth="50">g</charParams> 
          <charParams l="1976" t="395" r="1992" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="39" wordPenalty="0" meanStrokeWidth="50">e</charParams> 
          <charParams l="1995" t="395" r="2012" b="420" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="12" wordPenalty="0" meanStrokeWidth="50">g</charParams> 
          <charParams l="2015" t="395" r="2031" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="39" wordPenalty="0" meanStrokeWidth="50">e</charParams> 
          <charParams l="2033" t="395" r="2050" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="5" wordPenalty="0" meanStrokeWidth="50">v</charParams> 
          <charParams l="2052" t="395" r="2068" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="39" wordPenalty="0" meanStrokeWidth="50">e</charParams> 
          <charParams l="2072" t="395" r="2088" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="0" wordPenalty="0" meanStrokeWidth="50">n</charParams> 
          <charParams l="2091" t="395" r="2107" b="413" wordStart="0" wordFromDictionary="0" wordNormal="1" wordNumeric="0" wordIdentifier="0" charConfidence="100" serifProbability="57" wordPenalty="0" meanStrokeWidth="50">s</charParams></formatting></line></par> 
      </text> 
     </block> 
    </page> 
</document> 

ответ

0

У вашего документа есть связанное с ним пространство имен, поэтому вам нужно указать пространство имен в пути. Попробуйте следующее:

library(rvest)
# read_xml(), xml_ns() and xml_find_all() are xml2 functions; attach xml2
# explicitly rather than relying on rvest to make them available.
library(xml2)

# Parse the XML file.
page <- read_xml("test.xml")

# Check for namespaces: the document's default namespace is listed under the
# generated prefix "d1", which must be used in XPath expressions.
ns <- xml_ns(page)
ns

# Read the nodes with an explicit XPath and namespace map. rvest's xml_nodes()
# treats its second positional argument as a CSS selector, so an XPath string
# passed that way does not work.
nodes <- xml_find_all(page, ".//d1:variantText", ns)

# xml_text(nodes) would also include the characters of the nested <charParams>
# children, so keep only the first text node of each <variantText> element.
variant_text <- trimws(xml_text(xml_find_first(nodes, "./text()")))
data.frame(text = variant_text)
+0

Нужно ли мне заменить «d1» на результат `xml_ns`? Потому что это пока не работает. Или здесь требуется css или xpath? `xml_ns(page)` выдаёт: `d1 <-> http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml`, `xsi <-> http://www.w3.org/2001/XMLSchema-instance` – Jelmer

+0

Также можно попробовать просто использовать xml_find_all(page, ".//d1:variantText", xml_ns(page)). Это тоже работает, но когда я использовал xml_text, он возвращал и значения из charParams. – Dave2e

0

Я не разбирался, почему ваш XML не читается, но другим решением было бы использовать регулярное выражение.

library(stringr)

# NOTE(review): assumes doclist holds the XML content as character strings; if
# it holds file paths, read the files first (e.g. with readLines()) - confirm.
#
# "(.*)</variantText>" cannot match the sample document: "." does not cross
# line breaks and the closing tag sits on a later line than the opening one.
# Capture the text up to the next tag instead, and use str_match_all() so that
# every <variantText> element is returned, not only the first one per string.
# The result is a list with one matrix per element of doclist; column 2 holds
# the captured text.
str_match_all(doclist, "<variantText>([^<]*)")