2015-08-14 2 views

По сути, я пытаюсь собрать всю информацию с веб-страницы через Selenium, но со временем скрапер работает всё медленнее... до такой степени, что он не успевает пройти список людей и просто завершается по таймауту.

Заголовок вопроса: «Web Scrape w/Selenium; низкая производительность?»

Я понимаю, что со временем он должен замедляться, поскольку я храню HashSet уже обработанных идентификаторов и на каждой итерации цикла проверяю, не встречался ли идентификатор ранее.

Я приложил большой фрагмент кода, показывающий, как данные извлекаются с сайта, но сомневаюсь, что основная проблема именно в нём. Думаю, я что-то упускаю — например, утечку ресурсов или какое-то ограничение Selenium...

Если запустить WebManager в тихом режиме (quiet), он не сбивается примерно до 120 итераций. Если же запустить его без тихого режима, с обычным ChromeDriver, он в конце концов сбивается, выбрасывает ошибки и пропускает людей... Предполагаю, это происходит потому, что я трогал веб-страницу, пока она обрабатывалась, или из-за чего-то подобного.

Оставляя в стороне все прочие вопросы:

  • Видите ли вы очевидные утечки ресурсов?
  • Знаете ли вы, почему он в конце концов останавливается или становится настолько медленным, что им невозможно пользоваться?
  • Есть ли какой-то мусор, который я не убираю за собой?
  • Как увеличить скорость?

класс WebManager:

/// <summary>
/// Creates a manager backed by a visible Chrome browser.
/// </summary>
/// <param name="website">
/// Not used by the constructor code visible here.
/// NOTE(review): presumably the start URL; confirm against the full class.
/// </param>
public WebManager(string website)
{
    driver = new ChromeDriver();
}

/// <summary>
/// Creates a manager in either visible or quiet mode.
/// </summary>
/// <param name="quiet">
/// <c>false</c> starts a regular <c>ChromeDriver</c>; <c>true</c> launches
/// <c>quietserver.jar</c> through <c>java.exe</c> without a console window and
/// connects a <c>RemoteWebDriver</c> requesting the HtmlUnit capabilities.
/// </param>
/// <remarks>
/// The two modes are mutually exclusive. Without the <c>else</c>, a non-quiet
/// run would create a ChromeDriver and then immediately overwrite the
/// reference with the remote driver, leaking the Chrome session.
/// NOTE(review): neither <c>driver</c> nor <c>quietServer</c> is released by the
/// code visible here; whoever owns this object should call <c>driver.Quit()</c>
/// and stop the <c>quietServer</c> process when scraping ends.
/// </remarks>
public WebManager(Boolean quiet)
{
    if (!quiet)
    {
        driver = new ChromeDriver();
    }
    else
    {
        var processInfo = new ProcessStartInfo("java.exe", "-jar quietserver.jar")
        {
            CreateNoWindow = true,
            UseShellExecute = false
        };

        // NOTE(review): the remote driver is created right after the process is
        // started; nothing here waits for the server to begin listening.
        quietServer = Process.Start(processInfo);
        driver = new RemoteWebDriver(DesiredCapabilities.HtmlUnit());
    }
}

Основной процесс программы:

/// <summary>
/// Walks the offender list page of each county, reads the detail fields of every
/// listed offender through the web driver, assembles them into an <c>Offender</c>
/// object and passes it to <c>offender.Save</c>.
/// </summary>
/// <remarks>
/// NOTE(review): this listing appears to have lost its braces, its <c>try</c> keywords
/// and the openers of its block comments, so the exact nesting, and any statements
/// that sat on the lost lines (loop exits, list <c>Add</c> calls), cannot be confirmed here.
/// NOTE(review): the <c>WebManager</c> created below is never quit or disposed in the
/// visible code; confirm the driver and the quiet server are released elsewhere.
/// </remarks>
public void doScrape() 
     // The number of existing directories under the save path selects the county to start at (1 when there are none).
     int fileCount = Directory.GetDirectories(utils.savePath).Length; 
     int startCounty = (fileCount == 0 ? 1 : fileCount); 
     string lastOffenderId = null; 

     if (fileCount > 4 && localScrape) 
      Console.WriteLine("Please clear storage folders..."); 

     webManager = new WebManager(quiet); 

     // Local scrape: indexes 0..2 of localCounties; otherwise county numbers startCounty..64.
     for (int i = (localScrape ? 0 : startCounty); i <= (localScrape ? 2 : 64); i++) 
      webManager.driver.Navigate().GoToUrl(getOffenderListURL((localScrape ? localCounties[i] : i))); 
      // A fresh set per county. NOTE(review): no Add call on this set is visible in this listing.
      HashSet<string> completedList = new HashSet<string>(); 
      string locationStr = webManager.getElementByxPath(countyxPath).Text; 
      Console.WriteLine("Working on county: " + locationStr.Substring(locationStr.IndexOf(':') + 2)); 
      // Keep only the text starting two characters after the first ':'.
      locationStr = locationStr.Substring(locationStr.IndexOf(':') + 2); 
      // One iteration per list entry, starting at index 2 with an upper bound of 9999.
      for (int l = 2; l < 10000; l++) 
        // Waits up to 5 seconds for entry l's link to exist; a new WebDriverWait is created on every iteration.
        var element1 = new WebDriverWait(webManager.driver, TimeSpan.FromSeconds(5)).Until(ExpectedConditions.ElementExists((By.XPath(getOffenderxPath(l))))); 
        string linkToOffender = element1.GetAttribute("href"); 
        // The offender id is everything after the first '=' in the link.
        string offenderId = linkToOffender.Substring(linkToOffender.IndexOf('=') + 1); 
        if (completedList.Contains(offenderId)) 
         Console.WriteLine("Offender id " + offenderId + " has multiple aliases one of which is: " + element1.Text); 
        lastOffenderId = offenderId; 

        // NOTE(review): no navigation to the offender's detail page is visible before these reads; presumably lost from the listing.
        // Each field below is a separate XPath lookup through webManager.
        var currentPlacement = webManager.getElementTextByxPath(currentPlacementxPath); 
        var lastName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 2)); 
        var firstName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 3)); 
        var middleName = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 4)); 
        var dob = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 5)); 
        var sex = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 6)); 
        var riskLevel = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 7)); 
        var designation = webManager.getElementTextByxPath(getOffenderInfoBasic(1, 8)); 
        Console.WriteLine("Offender info: " + currentPlacement + " " + lastName + " " + firstName + " " + middleName + " " + dob + " " + sex + " " + designation); 

        var race = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 1)); 
        var ethnicity = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 2)); 
        var height = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 3)); 
        var weight = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 4)); 
        var hair = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 5)); 
        var eyes = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 6)); 
        var lenses = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 7)); 
        var photodate = webManager.getElementTextByxPath(getOffenderInfoBasic(2, 8)); 

        var jurisdiction = webManager.getElementTextByxPath(jurisductionxPath); 

        // ------------ Logic for addresses ------------------------ 
        Address[] addresses; 
        List<Address> addressList = new List<Address>(); 

        // Probes address rows 1..19; a missing row is handled by the NoSuchElementException handler below.
        for (int x = 1; x < 20; x++) 
          var address_1 = webManager.driver.FindElement(By.XPath(getOffenderAddress(x, 1))); 
          // NOTE(review): a missing element is reported by exception here, so this null check is presumably always true - confirm.
          if (address_1 != null) 
           Address adds = new Address(); 
           // Column 1 is looked up a second time here although address_1 already refers to it.
           adds.type = webManager.getElementTextByxPath(getOffenderAddress(x, 1)); 
           adds.county = webManager.getElementTextByxPath(getOffenderAddress(x, 2)); 
           adds.location = webManager.getElementTextByxPath(getOffenderAddress(x, 3)); 
         catch (NoSuchElementException e1) 

        Console.WriteLine(addressList.Count > 1 ? "Multiple addresses... listing" : "Only one address found"); 
        foreach (Address aa in addressList) 

        addresses = addressList.ToArray(); 
        // --------------- end of address logic -------------------- 

        //---------- Current Conviction logic ----------------------- 

        Conviction currentConviction = new Conviction(); 
        ConvictionDetails[] convictionDetails; 
        List<ConvictionDetails> currentConvictionDetails = new List<ConvictionDetails>(); 
        for (int x = 1; x < 20; x++) 
          * Not happy about this but it has to be done this way 
          * Checks the span[1] to see if it is still a conviction or if 
          * it starts the list of information. 
          * */ 
          var spanTitle = webManager.driver.FindElement(By.XPath(getConvictionTitlexPath(x))); 
          if (spanTitle.Text.Contains("Date")) 
          var title = webManager.driver.FindElement(By.XPath(getConvictionDetailsxPath(x, 1))); 
          var section = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 2)); 
          var subsection = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 3)); 
          var c_class = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 4)); 
          var categlory = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 5)); 
          var counts = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 6)); 
          var desc = webManager.getElementTextByxPath(getConvictionDetailsxPath(x, 7)); 

          ConvictionDetails cDetails = new ConvictionDetails(); 

          cDetails.c_class = c_class; 
          cDetails.categlory = categlory; 
          cDetails.counts = counts; 
          cDetails.description = desc; 
          cDetails.section = section; 
          cDetails.title = title.Text; 
          cDetails.subsection = subsection; 

         catch (NoSuchElementException e1) 

        convictionDetails = currentConvictionDetails.ToArray(); 

        // The conviction data fields are read from the entry right after the detail rows (index Count + 1).
        var dateOfCrime = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 1)); 
        var convictionDate = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 2)); 
        var victiminfo = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 3)); 
        var arrestingAgency = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 4)); 
        var offenseDescription = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 5)); 
        var relationship = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 6)); 
        var weapon = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 7)); 
        var force = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 8)); 
        var computer = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 9)); 
        var porn = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 10)); 
        var sentance = webManager.getElementTextByxPath(getConvictionDataxPath(currentConvictionDetails.Count + 1, 11)); 

        currentConviction.arrestingAgency = arrestingAgency; 
        currentConviction.computerUsed = computer; 
        currentConviction.convictionDate = convictionDate; 
        currentConviction.crimeDate = dateOfCrime; 
        currentConviction.forceUsed = force; 
        currentConviction.offenseDescription = offenseDescription; 
        currentConviction.pornInvolved = porn; 
        currentConviction.relationship = relationship; 
        currentConviction.sentance = sentance; 
        currentConviction.victimInfo = victiminfo; 
        currentConviction.weaponsUsed = weapon; 
        currentConviction.details = convictionDetails; 

        Console.WriteLine("-------Current Conviction --------"); 
        //----------- End Current Conviction logic ------------------- 

        //----------- Pervious Conviction logic ---------------------- 

        Conviction[] previousConvictions = null; 
        int lastDiv = 0; 
        List<Conviction> previousConvictionsList = new List<Conviction>(); 
        // Outer loop: div indexes 3..9; inner loop: entries 1..9 within each div.
        for (int x = 3; x < 10; x++) 
         List<ConvictionDetails> prevConvictionDetailsList = new List<ConvictionDetails>(); 
         int last = 0; 
          for (int y = 1; y < 10; y++) 
            var spanTitle = webManager.driver.FindElement(By.XPath(getListTitlexPathByDiv(x, y))); 

            if (!spanTitle.Text.Contains("Title")) 

            var title = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 1)); 
            var section = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 2)); 
            var subsection = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 3)); 
            var c_class = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 4)); 
            var categlory = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 5)); 
            var counts = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 6)); 
            var desc = webManager.getElementTextByxPath(getListxPathByDiv(x, y, 7)); 

            ConvictionDetails prevconvictionDetails = new ConvictionDetails(); 

            prevconvictionDetails.c_class = c_class; 
            prevconvictionDetails.categlory = categlory; 
            prevconvictionDetails.counts = counts; 
            prevconvictionDetails.description = desc; 
            prevconvictionDetails.section = section; 
            prevconvictionDetails.title = title; 
            prevconvictionDetails.subsection = subsection; 

           catch (NoSuchElementException) 
          // keeps track of the divisions but putting it here in stack will increase by 1 always will account for it later 
          lastDiv = x; 
          if (prevConvictionDetailsList.Count == last) 
          last = prevConvictionDetailsList.Count; 


          // As for the current conviction, the data fields sit at entry Count + 1 of this div.
          var prevDateOfCrime = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 1)); 
          var prevConvictionDate = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 2)); 
          var prevVictiminfo = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 3)); 
          var prevArrestingAgency = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 4)); 
          var prevOffenseDescription = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 5)); 
          var prevRelationship = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 6)); 
          var prevWeapon = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 7)); 
          var prevForce = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 8)); 
          var prevComputer = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 9)); 
          var prevPorn = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 10)); 
          var prevSentance = webManager.getElementTextByxPath(getListxPathByDiv(x, prevConvictionDetailsList.Count + 1, 11)); 

          Conviction previousConviction = new Conviction(); 

          previousConviction.arrestingAgency = prevArrestingAgency; 
          previousConviction.computerUsed = prevComputer; 
          previousConviction.convictionDate = prevConvictionDate; 
          previousConviction.crimeDate = prevDateOfCrime; 
          previousConviction.forceUsed = prevForce; 
          previousConviction.offenseDescription = prevOffenseDescription; 
          previousConviction.pornInvolved = prevPorn; 
          previousConviction.relationship = prevRelationship; 
          previousConviction.sentance = prevSentance; 
          previousConviction.victimInfo = prevVictiminfo; 
          previousConviction.weaponsUsed = prevWeapon; 
          previousConviction.details = prevConvictionDetailsList.ToArray(); 

         catch (NoSuchElementException) 

        // previousConvictions stays null when nothing was collected; the paragraph offset below depends on that.
        if (previousConvictionsList.Count > 0) 
         previousConvictions = previousConvictionsList.ToArray(); 

         Console.WriteLine("-----Previous convictions------"); 
         foreach (Conviction c in previousConvictions) 

        //-------------- End of Conviction logic --------------------- 

        //-----------------Beginning of Supervising until Scars-------- 
        // The first paragraph index is 4 without previous convictions and 3 with them.
        int adjustedParagraph = (previousConvictions == null ? 4 : 3); 
        var supervisingAgency = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph)); 
        var specialConditions = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph + 1)); 
        var maximumExpire = webManager.getElementTextByxPath(getMainContentParagraph(adjustedParagraph + 2)); 
        //-----------------End of Supervising until Scars-------------- 

        //---------------- Scars logic -------------------------------- 

        * At this point the last paragraph used was adjustedParagrpah + 2 
        * */ 
        int lastParagraph = 0; 
        List<string> markingList = new List<string>(); 
        String[] markings = null; 
        // Examines up to ten paragraphs following the three read above.
        for (int x = (adjustedParagraph + 3); x < (adjustedParagraph + 13); x++) 
         var marking1 = webManager.getElementTextByxPath(getMainContentParagraph(x)); 
         if (marking1.Contains("None")) 
          lastParagraph = x; 
         // aliasHeadingxPath does not depend on x, yet the heading is looked up again on every iteration.
         var aliasHeadingEle = webManager.getElementByxPath(aliasHeadingxPath); 
         // Paragraph x was already read for marking1 above; this is a second lookup of the same XPath.
         var webEle1 = webManager.getElementByxPath(getMainContentParagraph(x)); 
         // Compares vertical page positions of the alias heading and paragraph x.
         if (aliasHeadingEle.Location.Y > webEle1.Location.Y) 
          lastParagraph = x; 

        markings = markingList.ToArray(); 

        //------------------ End Scars logic ------------------------- 

        //------------------------ Alias Logic -------------------- 
        int lastParagraph2 = 0; 
        List<string> aliasList = new List<string>(); 
        String[] aliases = null; 
        // Examines up to nine paragraphs after the last one recorded by the scars loop.
        for (int x = (lastParagraph + 1); x < (lastParagraph + 10); x++) 
          var alias1 = webManager.driver.FindElement(By.XPath(getMainContentParagraph(x))); 
          if (alias1.Text.Contains("None")) 
           lastParagraph2 = x; 
         catch (NoSuchElementException) 

         // currentVehiclexPath does not depend on x, yet the heading is looked up again on every iteration.
         var currentVehicleHeading = webManager.driver.FindElement(By.XPath(currentVehiclexPath)); 
         var webEle1 = webManager.driver.FindElement(By.XPath(getMainContentParagraph(x))); 
         if (currentVehicleHeading.Location.Y > webEle1.Location.Y) 
          Console.WriteLine("Offender has alias: " + webEle1.Text); 
          lastParagraph2 = x; 


        aliases = aliasList.ToArray(); 

        //------------------- End Alias logic ------------------- 

        //--------------------- Vehicle Logic --------------------------- 

        * I feel bad about doing work in a catch... but for some reason I can't think 
        * of a better way at the momment so I am just going with it. 
        * If you can make this logic better please do so... 
        * The lastdiv is already 1 more than last used due to placement read comments above 
        * */ 

        Vehicle[] vehicles = null; 
         var vehicleElement = webManager.driver.FindElement(By.XPath(getVehiclePxPath(lastDiv))); 
        catch (NoSuchElementException) 
         List<Vehicle> vehicleList = new List<Vehicle>(); 
         // Probes vehicle entries 1..9 inside div lastDiv.
         for (int x = 1; x < 10; x++) 
           var vehiclePlate = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 1))); 
           var vehicleState = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 2))); 
           var vehicleYear = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 3))); 
           var vehicleModel = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 4))); 
           var vehicleColor = webManager.driver.FindElement(By.XPath(getListxPathByDiv(lastDiv, x, 5))); 

           Vehicle vehicle1 = new Vehicle(); 

           vehicle1.color = vehicleColor.Text; 
           vehicle1.makeModel = vehicleModel.Text; 
           vehicle1.plate = vehiclePlate.Text; 
           vehicle1.state = vehicleState.Text; 
           vehicle1.year = vehicleYear.Text; 

          catch (NoSuchElementException) 
         vehicles = vehicleList.ToArray(); 

        //--------------------- End Vehicle Logic ------------------------- 

        //-------- Creating & Adding fields into Offender Object---------- 
        Offender offender = new Offender(); 

        offender.currentPlacement = currentPlacement; 
        offender.designation = designation; 
        offender.ethnicity = ethnicity; 
        offender.dob = dob; 
        offender.eyeColor = eyes; 
        offender.hairColor = hair; 
        offender.CorrectiveLens = lenses; 
        offender.height = height; 
        offender.weight = weight; 
        offender.photoDate = photodate; 
        offender.offenderId = offenderId; 
        offender.riskLevel = riskLevel; 
        offender.race = race; 
        offender.sex = sex; 
        offender.lastName = lastName; 
        offender.firstName = firstName; 
        offender.middleName = middleName; 
        offender.address = addresses; 
        offender.jurisdiction = jurisdiction; 
        offender.currentConviction = currentConviction; 
        offender.perviousConvictions = previousConvictions; 
        offender.supervisingInfo = supervisingAgency; 
        offender.conditions = specialConditions; 
        offender.maximumDate = maximumExpire; 
        offender.markings = markings; 
        offender.aliases = aliases; 
        offender.currentVehicles = vehicles; 
        offender.linkToPic = getPhotoLink(offenderId); 

        offender.Save(utils.getSaveLocation(locationStr, offender.offenderId)); 
        //------ add to completed offender id list -------- 
        // The county list page is loaded again after every offender before the next entry is looked up.
        webManager.driver.Navigate().GoToUrl(getOffenderListURL((localScrape ? localCounties[i] : i))); 
       // Catch-all handler: only the last offender id is logged; the exception itself is not printed here.
       catch (Exception e) 
        Console.WriteLine("Last offender id " + lastOffenderId); 



В итоге я перешёл на другой HTML-парсер — HtmlAgilityPack.