| Semaphore |
| | UDF tratta dal forum ufficiale di AutoIt: permette, in pratica, di "manipolare" il codice HTML di una pagina web, ad esempio per ricavarne tutti gli URL, le immagini, ecc. CODICE #include-once
; Global search mode read by __HTML_Search() when it turns a caller-supplied
; value into a regular-expression fragment:
;   0 = value is used as a RegExp, returned as given (compare)
;   1 = value is used as a RegExp, wrapped in '.*?' on both sides (substring)
;   2 = value is escaped with __HTML_RegExMask(), returned as given (compare)
;   3 = value is escaped with __HTML_RegExMask(), wrapped in '.*?' (substring)
Global $_HTML_SEARCHMODE = 1
#Region #current#
; _HTML_ExtractURLVar
; _HTML_Get
; _HTML_GetAllImageSrc
; _HTML_GetAllLinks
; _HTML_GetImageSrc
; _HTML_GetLink
; _HTML_GetSource
; _HTML_GetTable
; _HTML_GetText
; _HTML_GetURLVar
; _HTML_ImageSave
; _HTML_Search
#EndRegion #current#
; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetSource
; Description ...: Downloads a page and returns its source as one flat line
;                  with every <script> element removed.
; AutoIt Version : V3.3.2.0
; Syntax ........: _HTML_GetSource($sURL)
; Parameter(s): .: $sURL        - address of the page to download
; Return Value ..: Success      - string (CR, LF and TAB replaced by blanks)
;                  Failure      - empty string
;                  @ERROR       - @error / @extended as set by InetRead
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 27 23:12:10 CET 2010
; Link ..........:
; Related .......:
; Example .......: No
; ==============================================================================
Func _HTML_GetSource($sURL)
	; option 1 is passed straight through to InetRead
	Local $dPage = InetRead($sURL, 1)
	If @error Then Return SetError(@error, @extended, "")

	; flatten the page first so that the script pattern below can span what
	; used to be several lines
	Local $sFlat = StringRegExpReplace(BinaryToString($dPage), '[\r\n\t]', " ")
	Return StringRegExpReplace($sFlat, '(?i)<script.*?>.*?</script>', "")
EndFunc   ;==>_HTML_GetSource
; #FUNCTION# ===================================================================
; Name ..........: _HTML_ExtractURLVar
; Description ...: Extracts the value of one query-string variable from an URL
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_ExtractURLVar($sURL, $sVar)
; Parameter(s): .: $sURL        - URL (the separators "&" and "&amp;" are both accepted)
;                  $sVar        - variable-name (taken literally, not as a RegExp)
; Return Value ..: Success      - string (the first value consisting of word characters and "%")
;                  Failure      - empty string
;                  @ERROR       - 1 = variable not found, 2 = bad pattern (see help-file: StringRegExp)
; Author(s) .....: Thorsten Willert
; Date ..........: Thu Dec 24 13:28:27 CET 2009
; ==============================================================================
Func _HTML_ExtractURLVar($sURL, $sVar)
	; The name must directly follow "?" or a "&" / "&amp;" separator, otherwise
	; "id" would also be found inside "userid=".  The optional prefix is lazy so
	; that the first occurrence of the variable wins.  \Q..\E keeps characters
	; such as "[" or "." in the name from being read as RegExp syntax.
	Local Const $sPattern = '\?(?:.*?&(?:amp;)?)??\Q' & $sVar & '\E=([\w%]+)'

	Local $aMatch = StringRegExp($sURL, $sPattern, 3)
	; remember the status now: every later function call overwrites @error
	Local $iErr = @error, $iExt = @extended

	If $iErr Or UBound($aMatch) = 0 Then
		If Not $iErr Then $iErr = 1
		Return SetError($iErr, $iExt, "")
	EndIf

	Return $aMatch[0]
EndFunc   ;==>_HTML_ExtractURLVar
; #FUNCTION# ===================================================================
; Name ..........: _HTML_Get
; Description ...: Returns the value of one attribute of the $iIndex-th tag
;                  whose search attribute matches $sValue.
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_Get($sHTML, $sTag, $sAttributeGet, $sValue[, $sAttribute = "id"[, $iIndex = 0]])
; Parameter(s): .: $sHTML        - HTML-Source
;                  $sTag         - HTML-tag
;                  $sAttributeGet - attribute to get the value from
;                  $sValue       - value of the attribute to search ($_HTML_SEARCHMODE)
;                  $sAttribute   - Optional: (Default = "id") : attribute to search
;                  $iIndex       - Optional: (Default = 0) : zero-based index among the matching tags
; Return Value ..: Success      - string
;                  Failure      - empty string
;                  @ERROR       - 1 (bad pattern, no matching tag, index out of range,
;                                 or the requested attribute is missing on that tag)
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 27 22:12:38 CET 2010
; ==============================================================================
Func _HTML_Get($sHTML, $sTag, $sAttributeGet, $sValue, $sAttribute = "id", $iIndex = 0)
	If $iIndex < 0 Then Return SetError(1, 0, "")

	; \b keeps a short tag name such as "a" from also matching <abbr> or <area>
	Local Const $sE1 = '(?i)<' & $sTag & '\b(.*?)>'
	Local Const $sE2 = '(?i)' & $sAttribute & '\s*=\s*("|''|)' & __HTML_Search($sValue) & '\1'
	Local Const $sE3 = '(?i)' & $sAttributeGet & '\s*=\s*("|''|)(.*?)\1'

	ConsoleWrite("_HTML_Get: " & @CRLF & $sE1 & @CRLF & $sE2 & @CRLF & $sE3 & @CRLF)

	; every element of $aTags is the attribute part of one matching tag
	Local $aTags = StringRegExp($sHTML, $sE1, 3)
	If @error = 2 Then
		ConsoleWriteError("_HTML_Get: Error in expression: " & $sE1 & @CRLF)
		Return SetError(1, 0, "")
	EndIf

	Local $iHit = 0, $bMatch, $aValue
	For $i = 0 To UBound($aTags) - 1
		ConsoleWrite($aTags[$i] & @CRLF)

		; @error has to be read right after the call: on a bad pattern the
		; function returns 0, so a check inside "If ... Then" is never reached
		$bMatch = StringRegExp($aTags[$i], $sE2)
		If @error = 2 Then
			ConsoleWriteError("_HTML_Get: Error in expression: " & $sE2 & @CRLF)
			Return SetError(1, 0, "")
		EndIf
		If Not $bMatch Then ContinueLoop

		; skip matching tags until the requested one is reached
		If $iHit < $iIndex Then
			$iHit += 1
			ContinueLoop
		EndIf

		; [0] = quote character, [1] = attribute value
		$aValue = StringRegExp($aTags[$i], $sE3, 3)
		If @error = 2 Then
			ConsoleWriteError("_HTML_Get: Error in expression: " & $sE3 & @CRLF)
			Return SetError(1, 0, "")
		EndIf
		If UBound($aValue) < 2 Then Return SetError(1, 0, "")
		Return $aValue[1]
	Next

	; fewer matching tags than $iIndex + 1: report failure instead of handing
	; back the value of the last tag that happened to match
	Return SetError(1, 0, "")
EndFunc   ;==>_HTML_Get
; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetAllImageSrc
; Description ...: Returns an array with all image-srcs of the source-code
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetAllImageSrc($sHTML[, $sFilter = '.*?'])
; Parameter(s): .: $sHTML       - HTML-Source
;                  $sFilter     - Optional: (Default = '.*?') : RegEx-filter for the src ($_HTML_SEARCHMODE)
; Return Value ..: Success      - Array
;                  Failure      - return value of StringRegExp
;                  @ERROR       - (see help-file: StringRegExp)
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 27 22:12:54 CET 2010
; ==============================================================================
Func _HTML_GetAllImageSrc($sHTML, $sFilter = '.*?')
	$sFilter = __HTML_Search($sFilter)

	; - "\b" after the tag name: only real <img> tags
	; - "[^>]*?" instead of ".*?": the search for src stays inside that tag
	; - "\s" before src: attributes such as data-src are not mistaken for src
	; NOTE(review): the double- and single-quoted forms use two separate capture
	; groups; verify whether StringRegExp emits an empty entry for the unused
	; first group when a single-quoted src matches.
	Local $sE1 = '(?i)<img\b[^>]*?\ssrc\s*=\s*(?:"(' & $sFilter & ')"|''(' & $sFilter & ')'')[^>]*>'
	ConsoleWrite("_HTML_GetAllImageSrc:" & @CRLF & $sE1 & @CRLF)

	Local $r = StringRegExp($sHTML, $sE1, 3)
	Return SetError(@error, @extended, $r)
EndFunc   ;==>_HTML_GetAllImageSrc
; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetAllLinks
; Description ...: Returns an array with all links of the source-code
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetAllLinks($sHTML[, $sFilter = '.*?'])
; Parameter(s): .: $sHTML       - HTML-Source
;                  $sFilter     - Optional: (Default = '.*?') : RegEx-filter for the href ($_HTML_SEARCHMODE)
; Return Value ..: Success      - Array
;                  Failure      - return value of StringRegExp
;                  @ERROR       - (see help-file: StringRegExp)
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 27 22:13:08 CET 2010
; ==============================================================================
Func _HTML_GetAllLinks($sHTML, $sFilter = '.*?')
	$sFilter = __HTML_Search($sFilter)

	; - "\b" after the tag name: <abbr>, <area>, <address> ... are not anchors
	; - "[^>]*?" instead of ".*?": the search for href stays inside that tag
	; - "\s" before href: attributes such as data-href are not mistaken for href
	; NOTE(review): the double- and single-quoted forms use two separate capture
	; groups; verify whether StringRegExp emits an empty entry for the unused
	; first group when a single-quoted href matches.
	Local $sE1 = '(?i)<a\b[^>]*?\shref\s*=\s*(?:"(' & $sFilter & ')"|''(' & $sFilter & ')'')[^>]*>'
	ConsoleWrite("_HTML_GetAllLinks:" & @CRLF & $sE1 & @CRLF)

	Local $r = StringRegExp($sHTML, $sE1, 3)
	Return SetError(@error, @extended, $r)
EndFunc   ;==>_HTML_GetAllLinks
; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetImageSrc
; Description ...: Looks up one <img> tag by attribute value and returns its src
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetImageSrc($sHTML, $sValue[, $sAttribute = "id"[, $iIndex = 0]])
; Parameter(s): .: $sHTML       - HTML-Source
;                  $sValue      - value the attribute has to match
;                  $sAttribute  - Optional: (Default = "id") : attribute of the image to test
;                  $iIndex      - Optional: (Default = 0) : passed on to _HTML_Get
; Return Value ..: Success      - img-src
;                  Failure      - empty string
;                  @ERROR       - 1
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 06 20:52:52 CET 2010
; ==============================================================================
Func _HTML_GetImageSrc($sHTML, $sValue, $sAttribute = "id", $iIndex = 0)
	Local $sSrc = _HTML_Get($sHTML, "img", "src", $sValue, $sAttribute, $iIndex)
	; forward the status of _HTML_Get, @extended is always reported as 0
	Local $iStatus = @error
	Return SetError($iStatus, 0, $sSrc)
EndFunc   ;==>_HTML_GetImageSrc
; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetLink
; Description ...: Looks up one <a> tag by attribute value and returns its href
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetLink($sHTML, $sValue[, $sAttribute = "id"[, $iIndex = 0]])
; Parameter(s): .: $sHTML       - HTML-Source
;                  $sValue      - value the attribute has to match
;                  $sAttribute  - Optional: (Default = "id") : attribute of the link to test
;                  $iIndex      - Optional: (Default = 0) : passed on to _HTML_Get
; Return Value ..: Success      - href
;                  Failure      - empty string
;                  @ERROR       - 1
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 06 20:52:56 CET 2010
; ==============================================================================
Func _HTML_GetLink($sHTML, $sValue, $sAttribute = "id", $iIndex = 0)
	Local $sHref = _HTML_Get($sHTML, "a", "href", $sValue, $sAttribute, $iIndex)
	; forward the status of _HTML_Get, @extended is always reported as 0
	Local $iStatus = @error
	Return SetError($iStatus, 0, $sHref)
EndFunc   ;==>_HTML_GetLink
; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetTable
; Description ...: Returns a HTML-table as 2-dim array.
; AutoIt Version : V3.3.2.0
; Syntax ........: _HTML_GetTable($sHTML[, $sValue = ""[, $sAttribute = "id"[, $iIndex = 0[, $iFilter = 30]]]])
; Parameter(s): .: $sHTML       - HTML-source
;                  $sValue      - Optional: (Default = "") : value of $sAttribute ($_HTML_SEARCHMODE)
;                  $sAttribute  - Optional: (Default = "id") : attribute of the table
;                  $iIndex      - Optional: (Default = 0) : index of the table
;                  $iFilter     - Optional: (Default = 30) : filter applied to every cell
;                               - 0 = no filter
;                               - 1 = removes non ascii characters
;                               - 2 = removes all double whitespaces
;                               - 4 = removes all double linefeeds
;                               - 8 = removes all html-tags
;                               - 16 = simple html-tag / entities convertor
; Return Value ..: Success      - array [rows][widest row]; missing cells are empty
;                  Failure      - array [1][1]
;                  @ERROR       - 1 (table not found or table without rows)
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 27 20:30:43 CET 2010
; Link ..........:
; Related .......: _HTML_GetText
; Example .......: No
; ==============================================================================
Func _HTML_GetTable($sHTML, $sValue = "", $sAttribute = "id", $iIndex = 0, $iFilter = 30)
	Local $aRet[1][1]

	; Fetch the table body unfiltered (0): $iFilter may contain 8 (strip all
	; tags), which would remove the <tr>/<td> markup needed below.  The filter
	; is applied to each cell instead.
	Local $sTable = _HTML_GetText($sHTML, "table", $sValue, $sAttribute, $iIndex, 0)
	If @error Then Return SetError(1, 0, $aRet)

	; (?s): rows and cells may span several lines
	Local $aRows = StringRegExp($sTable, '(?is)<tr\b[^>]*>(.*?)</tr>', 3)
	If @error Then Return SetError(1, 0, $aRet)

	Local $iRows = UBound($aRows), $iCols = 1, $aCells, $iCells, $sCell
	ReDim $aRet[$iRows][$iCols]

	For $j = 0 To $iRows - 1
		; \b keeps <thead> from being read as a <th> cell
		$aCells = StringRegExp($aRows[$j], '(?is)<(?:td|th)\b[^>]*>(.*?)</(?:td|th)>', 3)
		If @error Then ContinueLoop ; row without cells stays empty

		; only ever grow the array: shrinking it to the width of a shorter row
		; would cut off cells already stored for earlier rows
		$iCells = UBound($aCells)
		If $iCells > $iCols Then
			$iCols = $iCells
			ReDim $aRet[$iRows][$iCols]
		EndIf

		For $k = 0 To $iCells - 1
			$sCell = $aCells[$k]
			$aRet[$j][$k] = StringStripWS(__HTML_Filter($sCell, $iFilter), 3)
		Next
	Next

	Return $aRet
EndFunc   ;==>_HTML_GetTable

; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetText
; Description ...: Returns the content between the opening and closing tag of
;                  the $iIndex-th matching element.
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetText($sHTML, $sTag, $sValue[, $sAttribute = "id"[, $iIndex = 0[, $iFilter = 30]]])
; Parameter(s): .: $sHTML       - HTML-Source
;                  $sTag        - HTML-tag
;                  $sValue      - Optional: (Default = "") : value of this attribute ($_HTML_SEARCHMODE)
;                  $sAttribute  - Optional: (Default = "id") : attribute in this tag
;                  $iIndex      - Optional: (Default = 0) : index of the tag
;                  $iFilter     - Optional: (Default = 30) : String filter (you can add them)
;                               - 0 = no filter
;                               - 1 = removes non ascii characters
;                               - 2 = removes all double whitespaces
;                               - 4 = removes all double linefeeds
;                               - 8 = removes all html-tags
;                               - 16 = simple html-tag / entities convertor
; Return Value ..: Success      - string
;                  Failure      - empty string
;                  @ERROR       - 1
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 27 20:26:32 CET 2010
; ==============================================================================
Func _HTML_GetText($sHTML, $sTag, $sValue = "", $sAttribute = "id", $iIndex = 0, $iFilter = 30)
	If $iIndex < 0 Then Return SetError(1, 0, "")

	Local $sE1

	; "[^>]*" keeps the attribute search inside the opening tag; the inline
	; (?s) switches "dot matches newline" on for the content capture only, so
	; elements spanning several lines are found as well.
	If $sValue And $sAttribute Then
		; this pattern has two groups per match (quote character, content),
		; so the content of match n sits at array position 2n + 1
		$iIndex = $iIndex * 2 + 1
		$sE1 = '(?i)<' & $sTag & '\s+[^>]*?' & $sAttribute & '\s*=\s*("|''|)' & __HTML_Search($sValue) & '\1[^>]*>(?s)(.*?)</' & $sTag & '>'
	Else
		; \b: a tag name such as "p" must not match <pre> or <param>
		$sE1 = '(?i)<' & $sTag & '\b[^>]*>(?s)(.*?)</' & $sTag & '>'
	EndIf

	ConsoleWrite("_HTML_GetText: " & $sE1 & @CRLF)

	Local $r = StringRegExp($sHTML, $sE1, 3)
	If @error = 2 Then
		ConsoleWriteError("_HTML_GetText: Error in expression: " & $sE1 & @CRLF)
		Return SetError(1, 0, "")
	EndIf

	Local $iE = UBound($r)
	If $iE = 0 Or $iIndex >= $iE Then Return SetError(1, 0, "")

	; Work on a plain variable and keep the returned string: handing the array
	; element to the ByRef parameter and ignoring the return value does not
	; reliably store the filtered text.
	Local $sText = $r[$iIndex]
	If $iFilter Then $sText = __HTML_Filter($sText, $iFilter)
	Return $sText
EndFunc   ;==>_HTML_GetText
; #FUNCTION# ===================================================================
; Name ..........: _HTML_GetURLVar
; Description ...: Finds a link via _HTML_Get and extracts one variable from
;                  its href via _HTML_ExtractURLVar.
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_GetURLVar($sHTML, $sVar, $sValue[, $sAttribute = "id"[, $iIndex = 0]])
; Parameter(s): .: $sHTML       - HTML-source
;                  $sVar        - the variable in the URL
;                  $sValue      - the value $sAttribute has to match
;                  $sAttribute  - Optional: (Default = "id") : attribute of the link
;                  $iIndex      - Optional: (Default = 0) : index for the attribute
; Return Value ..: Success      - string
;                  Failure      - empty string
;                  @ERROR       - @error of the step that failed
; Author(s) .....: Thorsten Willert
; Date ..........: Fri Dec 25 10:26:26 CET 2009
; ==============================================================================
Func _HTML_GetURLVar($sHTML, $sVar, $sValue, $sAttribute = "id", $iIndex = 0)
	; step 1: the href of the requested link
	Local $sHref = _HTML_Get($sHTML, "a", "href", $sValue, $sAttribute, $iIndex)
	Local $iStatus = @error, $iExtra = @extended
	If $iStatus Then Return SetError($iStatus, $iExtra, "")

	; step 2: the variable inside that href
	Local $sResult = _HTML_ExtractURLVar($sHref, $sVar)
	$iStatus = @error
	$iExtra = @extended
	Return SetError($iStatus, $iExtra, $sResult)
EndFunc   ;==>_HTML_GetURLVar
; #FUNCTION# ===================================================================
; Name ..........: _HTML_ImageSave
; Description ...: Looks up an image in the HTML-source and downloads it.
; AutoIt Version : V3.3.2.0
; Syntax ........: _HTML_ImageSave($sHTML, $sValue[, $sAttribute = "id"[, $iIndex = 0[, $sBaseURL = ""[, $sDestDir = @SCRIPTDIR[, $sDestFile = ""]]]]])
; Parameter(s): .: $sHTML       - HTML-source
;                  $sValue      - value of $sAttribute
;                  $sAttribute  - Optional: (Default = "id") : attribute of the image
;                  $iIndex      - Optional: (Default = 0) : index of the attribute
;                  $sBaseURL    - Optional: (Default = "") : base url of the image (if there is no full path in the src);
;                                 when empty, the href of the <base> tag is used
;                  $sDestDir    - Optional: (Default = @SCRIPTDIR) : directory where the image is saved
;                  $sDestFile   - Optional: (Default = "") : target file, used as given (default is the orignal-name in $sDestDir)
; Return Value ..: Success      - 1
;                  Failure      - 0
;                  @ERROR       - 1 = image, base URL or file name could not be determined,
;                                 otherwise @error of InetGet
; Author(s) .....: Thorsten Willert
; Date ..........: Fri Jan 22 10:58:46 CET 2010
; Link ..........:
; Related .......: _HTML_GetImageSrc, _HTML_Get
; Remarks .......: The src is simply appended to the base URL; "../" style
;                  relative paths are not resolved.
; Example .......: Yes
#cs
	#include <_html.au3>
	#include <inet.au3>

	$HTML = _INetGetSource("http://autoit.de/index.php?page=Portal")
	_HTML_ImageSave($HTML, "registerS.png", "src", 0, "www.autoit.de", "c:\\")
#ce
; ==============================================================================
Func _HTML_ImageSave($sHTML, $sValue, $sAttribute = "id", $iIndex = 0, $sBaseURL = "", $sDestDir = @ScriptDir, $sDestFile = "")
	Local Const $sAbsolute = '(?i)^(?:http|ftp)s?://'

	Local $src = _HTML_GetImageSrc($sHTML, $sValue, $sAttribute, $iIndex)
	; without a src there is nothing to download
	If @error Or StringLen($src) = 0 Then Return SetError(1, 0, 0)

	If StringLeft($src, 2) = "//" Then
		; protocol-relative src
		$src = "http:" & $src
	ElseIf Not StringRegExp($src, $sAbsolute) Then
		; Only a src without scheme needs a base URL; an absolute src must not
		; be prefixed.  Fall back to <base href="..."> when none was given.
		If Not $sBaseURL Then $sBaseURL = _HTML_Get($sHTML, "base", "href", "", "href")
		If Not $sBaseURL Then Return SetError(1, 0, 0)
		If Not StringRegExp($sBaseURL, $sAbsolute) Then $sBaseURL = "http://" & $sBaseURL

		; exactly one "/" between base and src
		If StringRight($sBaseURL, 1) = "/" And StringLeft($src, 1) = "/" Then
			$src = StringTrimLeft($src, 1)
		ElseIf StringRight($sBaseURL, 1) <> "/" And StringLeft($src, 1) <> "/" Then
			$sBaseURL &= "/"
		EndIf
		$src = $sBaseURL & $src
	EndIf

	If Not FileExists($sDestDir) Then $sDestDir = @ScriptDir
	If Not $sDestFile Then
		; file name = last path segment of the URL without query / fragment
		Local $sName = StringRegExpReplace($src, '[?#].*$', "")
		$sName = StringMid($sName, StringInStr($sName, "/", 2, -1) + 1)
		If StringLen($sName) = 0 Then Return SetError(1, 0, 0)
		If StringRight($sDestDir, 1) <> "\" Then $sDestDir &= "\"
		$sDestFile = $sDestDir & $sName
	EndIf

	Local $iBytes = InetGet($src, $sDestFile, 1)
	Local $iErr = @error
	If Not $iBytes Then Return SetError($iErr, 0, 0)

	ConsoleWrite("_HTML_ImageSave:" & @CRLF & "from:" & @TAB & $src & @CRLF & "to:" & @TAB & $sDestFile & @CRLF)
	Return FileExists($sDestFile)
EndFunc   ;==>_HTML_ImageSave
; #FUNCTION# ===================================================================
; Name ..........: _HTML_Search
; Description ...: Tests whether the search expression occurs in the source
;                  once everything of the form <...> has been removed.
; AutoIt Version : V3.3.0.0
; Syntax ........: _HTML_Search($sHTML, $sSearch)
; Parameter(s): .: $sHTML       - HTML-source
;                  $sSearch     - the string to search ($_HTML_SEARCHMODE)
; Return Value ..: Success      - 1
;                  Failure      - 0
; Author(s) .....: Thorsten Willert
; Date ..........: Wed Jan 06 21:19:29 CET 2010
; ==============================================================================
Func _HTML_Search($sHTML, $sSearch)
	; text without markup
	Local $sPlain = StringRegExpReplace($sHTML, '<[^>]*>', "")
	; search expression prepared according to the global search mode
	Local $sNeedle = __HTML_Search($sSearch)
	Return StringRegExp($sPlain, $sNeedle)
EndFunc   ;==>_HTML_Search
; #INTERNAL_USE_ONLY# ==========================================================
; Name ..........: __HTML_RegExMask
; Description ...: Escapes RegExp metacharacters so that the string matches literally
; Syntax ........: __HTML_RegExMask($s)
; Parameter(s): .: $s           - string to escape
; Return Value ..: the string with a backslash in front of each of  \ ^ $ . | ? * + - ( ) [ ] { }
; ==============================================================================
Func __HTML_RegExMask($s)
	; One character per match: with a "+" behind the group, $1 holds only the
	; last character of a run, so ".*" would come back as "\*" without the dot.
	Return StringRegExpReplace($s, '([\\\^\$\.\|\?\*\+\-\(\)\[\]\{\}])', '\\$1')
EndFunc   ;==>__HTML_RegExMask

; #INTERNAL_USE_ONLY# ==========================================================
; Name ..........: __HTML_Search
; Description ...: Turns a search value into a RegExp fragment according to $_HTML_SEARCHMODE
; Syntax ........: __HTML_Search($s)
; Parameter(s): .: $s           - search value; the literal '.*?' is always returned unchanged
; Return Value ..: mode 0 / other - $s unchanged
;                  mode 1         - '.*?' & $s & '.*?'
;                  mode 2         - $s escaped by __HTML_RegExMask
;                  mode 3         - '.*?' & escaped $s & '.*?'
; ==============================================================================
Func __HTML_Search($s)
	; "==" compares as strings; with "=" a numeric $s such as 0 would be
	; compared numerically and treated like the '.*?' wildcard
	If $s == '.*?' Then Return $s

	Switch $_HTML_SEARCHMODE
		Case 0
			Return $s
		Case 1
			Return '.*?' & $s & '.*?'
		Case 2
			Return __HTML_RegExMask($s)
		Case 3
			Return '.*?' & __HTML_RegExMask($s) & '.*?'
		Case Else
			Return $s
	EndSwitch
EndFunc   ;==>__HTML_Search
; #INTERNAL_USE_ONLY# ==========================================================
; Name ..........: __HTML_Filter
; Description ...: Filter for strings
; AutoIt Version : V3.3.0.0
; Syntax ........: __HTML_Filter(ByRef $sString[, $iMode = 0])
; Parameter(s): .: $sString     - String to filter (is modified in place as well)
;                  $iMode       - Optional: (Default = 0) : removes nothing; the values are bit flags and can be added
;                               - 0 = no filter
;                               - 1 = removes non ascii characters
;                               - 2 = removes all double whitespaces
;                               - 4 = removes all double linefeeds
;                               - 8 = removes all html-tags
;                               - 16 = simple html-tag / entities convertor
; Return Value ..: Success      - Filterd String
;                  Failure      - Input String
; Remarks .......: Order of processing: <p>/<br> to line breaks (16), tag removal (8),
;                  entity decoding (16), linefeeds (4), whitespace (2), non-ASCII (1).
; Author(s) .....: Thorsten Willert, Stephen Podhajecki {gehossafats at netmdc. com} _ConvertEntities
; Date ..........: Wed Jan 27 20:49:59 CET 2010
; ==============================================================================
Func __HTML_Filter(ByRef $sString, $iMode = 0)
	If $iMode = 0 Then Return $sString

	Local $bConvert = (BitAND($iMode, 16) <> 0)

	; 16 (part 1): paragraph and line-break tags become line breaks.  This has
	; to happen before the tag filter removes them.
	If $bConvert Then
		; \b: <pre>, <param> ... are not paragraphs
		$sString = StringRegExpReplace($sString, '(?i)<p\b[^>]*>', @CRLF & @CRLF)
		; also accepts <br/> and <br />
		$sString = StringRegExpReplace($sString, '(?i)<br\s*/?>', @CRLF)
	EndIf

	; 8 Tag filter - runs before the entities are decoded, so that text written
	; as &lt;...&gt; is not mistaken for markup and thrown away
	If BitAND($iMode, 8) Then
		$sString = StringRegExpReplace($sString, "<[^>]*>", "")
	EndIf

	; 16 (part 2): simple entities converter
	If $bConvert Then
		; Names of the Latin-1 entities for the character codes 161 .. 255, in
		; code order: element n of the split list belongs to code 160 + n.
		Local Const $sLatin1 = "iexcl|cent|pound|curren|yen|brvbar|sect|uml|copy|ordf|laquo|not|shy|reg|macr|deg|" & _
				"plusmn|sup2|sup3|acute|micro|para|middot|cedil|sup1|ordm|raquo|frac14|frac12|frac34|iquest|" & _
				"Agrave|Aacute|Acirc|Atilde|Auml|Aring|AElig|Ccedil|Egrave|Eacute|Ecirc|Euml|Igrave|Iacute|" & _
				"Icirc|Iuml|ETH|Ntilde|Ograve|Oacute|Ocirc|Otilde|Ouml|times|Oslash|Ugrave|Uacute|Ucirc|Uuml|" & _
				"Yacute|THORN|szlig|agrave|aacute|acirc|atilde|auml|aring|aelig|ccedil|egrave|eacute|ecirc|" & _
				"euml|igrave|iacute|icirc|iuml|eth|ntilde|ograve|oacute|ocirc|otilde|ouml|divide|oslash|" & _
				"ugrave|uacute|ucirc|uuml|yacute|thorn|yuml"
		Local $aNames = StringSplit($sLatin1, "|") ; [0] = number of names

		; Entity names are case sensitive (&Agrave; is not &agrave;), hence
		; casesense = 1 in every StringReplace below.
		$sString = StringReplace($sString, "&quot;", '"', 0, 1)
		$sString = StringReplace($sString, "&lt;", "<", 0, 1)
		$sString = StringReplace($sString, "&gt;", ">", 0, 1)
		$sString = StringReplace($sString, "&nbsp;", " ", 0, 1)
		For $x = 1 To $aNames[0]
			$sString = StringReplace($sString, "&" & $aNames[$x] & ";", ChrW(160 + $x), 0, 1)
		Next

		; decimal character references, the ampersand (38) excepted
		For $x = 32 To 255
			If $x = 38 Then ContinueLoop
			$sString = StringReplace($sString, "&#" & $x & ";", ChrW($x), 0, 1)
		Next

		; The ampersand comes last and in a single pass; decoded earlier it
		; would turn "&amp;lt;" into "&lt;" and then into "<".
		$sString = StringRegExpReplace($sString, '&(?:amp|#38);', '&')
	EndIf

	; 4 remove all double cr, lf: every run of line breaks (with the blanks
	; around them) becomes a single CRLF
	If BitAND($iMode, 4) Then
		$sString = StringRegExpReplace($sString, "(?:[ \t]*[\n\r]+[ \t]*)+", @CRLF)
	EndIf

	; 2 remove all double whitespaces
	If BitAND($iMode, 2) Then
		$sString = StringRegExpReplace($sString, "[[:blank:]]+", " ")
		; drop the blank next to a line break but keep the line break itself
		$sString = StringRegExpReplace($sString, " ?(\r\n|\r|\n) ?", "$1")
	EndIf

	; 1 remove all non ASCII
	If BitAND($iMode, 1) Then
		$sString = StringRegExpReplace($sString, "[^\x00-\x7F]", " ")
	EndIf

	Return $sString
EndFunc   ;==>__HTML_Filter
| | |
| |
|