2015-05-02 2 views

У меня есть задача: конвертировать несколько сотен PDF-документов в формат HTML. Я попробовал несколько автономных конвертеров, но у них проблемы с обработкой столбцов и выравнивания по ширине с переносами.

Тема: автоматизация использования MS Word в качестве конвертера PDF в HTML.

Однако, если я просто открываю документ в MS Word, он отлично их обрабатывает.

Итак, в принципе, мне нужен способ автоматизировать процесс: открыть PDF в Word, дождаться окончания его обработки и сохранить результат в виде HTML (отфильтрованного).

Кто-нибудь знает, как я могу это сделать?



Мы можем сделать это в PowerShell.

Разобьём вашу задачу на две части.

ПЕРВАЯ ЧАСТЬ — преобразование PDF в документ Word (приведённый ниже скрипт написан на VBScript)

'**** Watches a source folder and converts every PDF dropped into it to a Word
'**** document, one file per pass, then sleeps and looks again.
'**** It checks the source folder every 30 seconds. To change this interval
'**** change the sleep time at the end of the main loop to the number of seconds x 1000
'**** wscript.sleep (15000) would check the folder every 15 seconds
'**** The program uses AnyBizSoft PDtoWord converter. It is available as a free version off of
'*** facebook here - Facebook - http://www.facebook.com/AnyBizSoft?v=app_6009294086
'*** You can buy it from their website at http://www.anypdftools.com/pdf-to-word.html
'*** I have no connection with them other than they were the first one i found that worked with
'*** a command line.
'*** The script uses two directories. C:\Source\ is where pdf files are copied to
'*** C:\Converted is where the converted file is placed. It is either a doc file if you have Office 2003 or
'*** older or a docx if you have Office 2007 or newer.
'*** After the file is converted the original pdf is deleted. This can be changed by commenting out the
'*** Line that deletes the file near the end of the script.
'*** The script can be placed anywhere, but the pdftoword folder needs to be copied from the program files
'*** directory to the c:\source folder
Option Explicit

Dim objFSO, wshShell, objWMIService, colMonitoredEvents, objLatestEvent
Dim strComputer, spath, dpath
Dim objFile, sourcefile, sourcefilename, convertstr
Dim d, hhmmss, newname, oldname, ext
Dim colProcess, objProcess

Set objFSO = CreateObject("Scripting.FileSystemObject")
Set wshShell = WScript.CreateObject("WScript.Shell")
strComputer = "."
spath = "C:\source\"    '*** Source directory
dpath = "C:\converted\" '*** Destination or Converted Directory

Set objWMIService = GetObject("winmgmts:" _
    & "{impersonationLevel=impersonate}!\\" & _
     strComputer & "\root\cimv2")

'*** Subscribe to "file created in c:\source" events (polled by WMI every 10 seconds).
'*** NOTE: the directory is hard-coded in the query; keep it in sync with spath.
Set colMonitoredEvents = objWMIService.ExecNotificationQuery _
    ("SELECT * FROM __InstanceCreationEvent WITHIN 10 WHERE " _
     & "Targetinstance ISA 'CIM_DirectoryContainsFile' and " _
      & "TargetInstance.GroupComponent= " _
       & "'Win32_Directory.Name=""c:\\\\source""'")

Do
    '*** Pick one PDF from the source folder (other file types are ignored so a
    '*** left-over doc/docx is never fed back into the converter)
    sourcefile = ""
    For Each objFile In objFSO.GetFolder(spath).Files
        If LCase(objFSO.GetExtensionName(objFile.Name)) = "pdf" Then
            sourcefile = objFile.Name
            Exit For
        End If
    Next

    If sourcefile <> "" Then
        '*** Get just the filename without the extension (safe for names with several dots)
        sourcefilename = objFSO.GetBaseName(sourcefile)

        '*** Call pdftoword to convert the file
        convertstr = "c:\source\pdftoword\pdftoword.exe " & Chr(34) & spath & sourcefile & Chr(34)
        wshShell.Run convertstr, 6, False

        '*** Wait for the doc/docx of THIS file to be created before continuing.
        '*** Matching on "<basename>.doc" instead of just "doc" keeps queued creation
        '*** events of PDFs such as "document.pdf" from ending the wait too early.
        '*** NOTE: NextEvent blocks indefinitely if the converter never produces output.
        Do
            Set objLatestEvent = colMonitoredEvents.NextEvent
        Loop Until InStr(1, objLatestEvent.TargetInstance.PartComponent, _
                         sourcefilename & ".doc", vbTextCompare) > 0

        '*** Make time stamp for file name
        d = Now
        hhmmss = Right("00" & Hour(d), 2) & Right("00" & Minute(d), 2) & Right("00" & Second(d), 2)

        '*** Work out which kind of document the converter produced
        If objFSO.FileExists(spath & sourcefilename & ".docx") Then
            ext = ".docx"
        ElseIf objFSO.FileExists(spath & sourcefilename & ".doc") Then
            ext = ".doc"
        Else
            ext = ""
        End If

        If ext <> "" Then
            '*** Add the timestamp to the converted file
            oldname = spath & sourcefilename & ext
            newname = dpath & sourcefilename & "-" & hhmmss & ext

            '*** Exit program if file exists in the destination folder. Highly unlikely since it is timestamped
            If objFSO.FileExists(newname) Then
                WScript.Echo "Destination file " & newname & " exists already"
                WScript.Quit 1
            End If

            '*** move converted file to the converted folder then delete original
            objFSO.MoveFile oldname, newname
            objFSO.DeleteFile spath & sourcefile 'Delete or comment this line if you do not want the original deleted
        End If

        ' *** Kill PDFtoword process
        Set colProcess = objWMIService.ExecQuery _
            ("Select * from Win32_Process Where Name = 'PDFtoWord.exe'")
        For Each objProcess In colProcess
            objProcess.Terminate
        Next
    End If

    WScript.Sleep 30000 'Wait 30 seconds to look for next file. 1000 = 1 second
Loop

ВТОРАЯ ЧАСТЬ — преобразование документов Word в HTML

# Converts every Word document found in $docpath to filtered HTML.
#   -docpath   folder containing the .doc/.docx files to convert
#   -htmlpath  folder that receives the .html files (defaults to $docpath)
param([string]$docpath,[string]$htmlpath = $docpath)

# Word runs as a separate process, so give it absolute paths only.
$htmlpath = (Resolve-Path $htmlpath).Path

# "*.doc*" picks up both .doc and .docx files.
$srcfiles = Get-ChildItem $docpath -Filter "*.doc*"

# Numeric value of WdSaveFormat.wdFormatFilteredHTML. Using the number avoids
# depending on the Office interop assembly being loaded before Word is started.
$saveFormat = 10

$word = New-Object -ComObject Word.Application
$word.Visible = $False

# Opens one document in Word and saves it as "<htmlpath>\<base name>.html".
# $doc is the file object (as returned by Get-ChildItem) to convert.
function saveas-filteredhtml($doc)
{
    $target = Join-Path $htmlpath ($doc.BaseName + ".html")
    $opendoc = $word.Documents.Open($doc.FullName)
    try
    {
        $opendoc.SaveAs([ref]$target, [ref]$saveFormat)
    }
    finally
    {
        # Always close the document so Word does not keep the file locked.
        $opendoc.Close()
    }
}

try
{
    ForEach ($doc in $srcfiles)
    {
        Write-Host "Processing :" $doc.FullName
        saveas-filteredhtml $doc
    }
}
finally
{
    # Shut Word down even if a conversion failed, otherwise a hidden
    # WINWORD.EXE process is left running.
    $word.Quit()
}


Сохраните этот код в файл convertdoc-tohtml.ps1, и вы сможете запускать его на наборе документов Word независимо от расширения — doc или docx.

Вот как вы можете запустить его:

convertdoc-tohtml.ps1 -docpath "C:\Documents" -htmlpath "C:\Output" 
Смежные вопросы