Modul:Text: Unterschied zwischen den Versionen

Version vom 20. März 2015, 12:11 Uhr

--[=[ 2015-03-17 Text utilities ]=]

-- Table of library functions; handed out via p.Text()
local Text = { }
-- Lazily built caches; each stays false until first needed
local PatternCJK        = false    -- character class of CJK code points
local PatternCombined   = false    -- character class of combining marks
local PatternLatin      = false    -- anchored pattern for latin-only text
local PatternTerminated = false    -- pattern for a terminated sentence
local RangesLatin       = false    -- list of { first, last } code points

Text.concatParams = function ( args, apply, adapt )

    -- Concat list items into one string
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --     apply  -- string (optional); separator (default: "|")
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string; items are trimmed, empty items are skipped,
    --          and items appear in ascending order of their numeric key
    local collect = { }
    local order   = { }
    local v
    -- pairs() visits keys in unspecified order, therefore the numeric
    -- keys are gathered first and sorted before items are processed
    for k in pairs( args ) do
        if type( k ) == "number" then
            table.insert( order, k )
        end
    end
    table.sort( order )
    for i = 1, #order do
        v = mw.text.trim( args[ order[ i ] ] )
        if v ~= "" then
            if adapt then
                v = mw.ustring.format( adapt, v )
            end
            table.insert( collect, v )
        end
    end    -- for i
    return table.concat( collect,  apply or "|" )

end -- Text.concatParams()

Text.containsCJK = function ( analyse )

    -- Is any CJK code within?
    -- Parameter:
    --     analyse  -- string
    -- Returns: true, if CJK detected
    local r
    -- Use the file-level local PatternCJK as cache; the former spelling
    -- "patternCJK" was not declared and created a global variable
    if not PatternCJK then
        -- character class [U+3400-U+9FFF U+20000-U+2B81F]
        PatternCJK = mw.ustring.char( 91,
                                       13312, 45,  40959,
                                      131072, 45, 178207,
                                      93 )
    end
    if mw.ustring.find( analyse, PatternCJK ) then
        r = true
    else
        r = false
    end
    return r

end -- Text.containsCJK()

Text.isLatinRange = function ( adjust )

    -- Are characters expected to be latin or symbols within latin texts?
    -- Precondition:
    --     adjust  -- string, or nil for initialization
    -- Returns: true, if valid for latin only
    --          false, if any other character is present
    --          nil, if adjust is nil (initialization only)
    -- Side effect: fills the caches RangesLatin and PatternLatin on
    --              first call
    local r
    if not RangesLatin then
        -- pairs of first and last code point regarded as latin
        RangesLatin = { {    7,  687 },
                        { 7531, 7578 },
                        { 7680, 7935 },
                        { 8194, 8250 } }
    end
    if not PatternLatin then
        local range
        -- anchored character class built from all ranges;
        -- 45 is the hyphen between both ends of a range
        PatternLatin = "^["
        for i = 1, #RangesLatin do
            range = RangesLatin[ i ]
            PatternLatin = PatternLatin ..
                           mw.ustring.char( range[ 1 ], 45, range[ 2 ] )
        end    -- for i
        PatternLatin = PatternLatin .. "]*$"
    end
    if adjust then
        if mw.ustring.match( adjust, PatternLatin ) then
            r = true
        else
            r = false
        end
    end
    return r

end -- Text.isLatinRange()

Text.listToText = function ( args, adapt )

    -- Format list items similar to mw.text.listToText()
    -- Parameter:
    --     args   -- table (sequence) with numKey=string
    --     adapt  -- string (optional); format including "%s"
    -- Returns: string; items are trimmed, empty items are skipped,
    --          and items appear in ascending order of their numeric key
    local collect = { }
    local order   = { }
    local v
    -- pairs() visits keys in unspecified order, therefore the numeric
    -- keys are gathered first and sorted before items are processed
    for k in pairs( args ) do
        if type( k ) == "number" then
            table.insert( order, k )
        end
    end
    table.sort( order )
    for i = 1, #order do
        v = mw.text.trim( args[ order[ i ] ] )
        if v ~= "" then
            if adapt then
                v = mw.ustring.format( adapt, v )
            end
            table.insert( collect, v )
        end
    end    -- for i
    return mw.text.listToText( collect )

end -- Text.listToText()

Text.removeDiacritics = function ( adjust )

    -- Strip every combining diacritical mark from a string
    -- Parameter:
    --     adjust  -- string
    -- Returns: string; decomposed (NFD), marks removed, recomposed (NFC)
    if not PatternCombined then
        -- character class of four ranges of combining marks,
        -- built once and kept in the file-level cache
        PatternCombined = mw.ustring.char(
            91,
            0x0300, 45, 0x036F,
            0x1AB0, 45, 0x1AFF,
            0x1DC0, 45, 0x1DFF,
            0xFE20, 45, 0xFE2F,
            93 )
    end
    -- only the first result of gsub (the string) is kept
    local stripped = mw.ustring.gsub( mw.ustring.toNFD( adjust ),
                                      PatternCombined,
                                      "" )
    return mw.ustring.toNFC( stripped )

end -- Text.removeDiacritics()

Text.sentenceTerminated = function ( analyse )

    -- Does the string end with a dot, question or exclamation mark?
    --     Trailing quotation marks, closing brackets etc. are tolerated
    -- Parameter:
    --     analyse  -- string
    -- Returns: true, if sentence terminated; false otherwise
    if not PatternTerminated then
        -- first class: terminating marks, including the code points
        -- 12290, 65281, 65294, 65311; second class: permitted trailers
        local lead = mw.ustring.char( 91, 12290, 65281, 65294, 65311 )
        PatternTerminated = lead .. "!%.%?…][\"'%]‹›«»‘’“”]*$"
    end
    return mw.ustring.find( analyse, PatternTerminated ) and true or false

end -- Text.sentenceTerminated()

Text.ucfirstAll = function ( adjust )

    -- Capitalize all words
    -- Precondition:
    --     adjust  -- string
    -- Returns: string with all first letters in upper case
    --     Named HTML entities listed below are protected: they are
    --     switched to numeric form before capitalizing (digits are not
    --     affected by the "%W%l" search) and switched back afterwards.
    --     Consequently a numeric entity from this list which was already
    --     present in adjust is returned in named form.
    local entities = { { "&amp;",    "&#38;"   },
                       { "&lt;",     "&#60;"   },
                       { "&gt;",     "&#62;"   },
                       { "&nbsp;",   "&#160;"  },
                       { "&thinsp;", "&#8201;" },
                       { "&zwnj;",   "&#8204;" },
                       { "&zwj;",    "&#8205;" },
                       { "&lrm;",    "&#8206;" },
                       { "&rlm;",    "&#8207;" } }
    -- leading space permits "%W%l" to match the very first word
    local r = " " .. adjust
    local i = 1
    local c, j, m
    if adjust:find( "&" ) then
        for k = 1, #entities do
            r = r:gsub( entities[ k ][ 1 ], entities[ k ][ 2 ] )
        end    -- for k
        m = true
    end
    while i do
        -- non-word character followed by lower case letter
        i = mw.ustring.find( r, "%W%l", i )
        if i then
            j = i + 1
            c = mw.ustring.upper( mw.ustring.sub( r, j, j ) )
            r = string.format( "%s%s%s",
                               mw.ustring.sub( r, 1, i ),
                               c,
                               mw.ustring.sub( r, i + 2 ) )
            i = j
        end
    end -- while i
    -- drop the leading space again
    r = r:sub( 2 )
    if m then
        for k = 1, #entities do
            r = r:gsub( entities[ k ][ 2 ], entities[ k ][ 1 ] )
        end    -- for k
        -- hexadecimal entities got their "x" capitalized; revert
        r = r:gsub( "&#X(%x+);", "&#x%1;" )
    end
    return r

end -- Text.ucfirstAll()

Text.uprightNonlatin = function ( adjust )

    -- Ensure non-italics for non-latin text parts
    --     One single greek letter might be granted
    -- Precondition:
    --     adjust  -- string
    -- Returns: string with non-latin parts enclosed in <span>
    local r
    -- called without argument: only fills PatternLatin and RangesLatin
    Text.isLatinRange()
    if mw.ustring.match( adjust, PatternLatin ) then
        -- latin only, horizontal dashes, quotes
        r = adjust
    else
        local c
        local j    = false    -- start of current non-latin part, if any
        local k    = 1        -- start of text not yet copied into r
        local m    = false    -- bookkeeping for single greek letter
        local n    = mw.ustring.len( adjust )
        -- result so far, latin text before, then non-latin part wrapped
        local span = "%s%s<span dir='auto' style='font-style:normal'>%s</span>"
        local flat = function ( a )
                  -- isLatin: is code point a within any of RangesLatin?
                  local range
                  for i = 1, #RangesLatin do
                      range = RangesLatin[ i ]
                      if a >= range[ 1 ]  and  a <= range[ 2 ] then
                          return true
                      end
                  end    -- for i
              end -- flat()
        local form = function ( a )
                -- append adjust[k..j-1] plain and adjust[j..a] wrapped
                return string.format( span,
                                      r,
                                      mw.ustring.sub( adjust, k, j - 1 ),
                                      mw.ustring.sub( adjust, j, a ) )
              end -- form()
        r = ""
        for i = 1, n do
            c = mw.ustring.codepoint( adjust, i, i )
            if c > 64  or  c == 38  or  c == 60 then    -- '&' '<'
                if flat( c ) then
                    if j then
                        if m then
                            if i == m then
                                -- single greek letter.
                                j = false
                            end
                            m = false
                        end
                        if j then
                            -- keep trailing spaces and opening
                            -- parentheses outside of the wrapped part
                            local nx = i - 1
                            local s  = ""
                            for ix = nx, 1, -1 do
                                c = mw.ustring.sub( adjust, ix, ix )
                                if c == " "  or  c == "(" then
                                    nx = nx - 1
                                    s  = c .. s
                                else
                                    break -- for ix
                                end
                            end -- for ix
                            r = form( nx ) .. s
                            j = false
                            k = i
                        end
                    end
                elseif not j then
                    j = i
                    if c >= 880  and  c <= 1023 then
                        -- single greek letter?
                        m = i + 1
                    else
                        m = false
                    end
                end
            elseif m then
                m = m + 1
            end
        end    -- for i
        if j  and  ( not m  or  m < n ) then
            r = form( n )
        else
            r = r .. mw.ustring.sub( adjust, k )
        end
    end
    return r

end -- Text.uprightNonlatin()

-- Export
-- Table of functions accessible via #invoke; returned by this module
local p = { }

function p.concatParams( frame )

    -- Template interface for Text.concatParams()
    -- Parameter (frame.args):
    --     template   -- "1": take list items from the parent frame
    --     separator  -- passed on as separator
    --     format     -- passed on as format
    local params  = frame.args
    local args    = params
    local foreign = params.template
    if type( foreign ) == "string" then
        foreign = ( mw.text.trim( foreign ) == "1" )
    end
    if foreign then
        args = frame:getParent().args
    end
    return Text.concatParams( args, params.separator, params.format )

end

function p.containsCJK( frame )

    -- Template interface for Text.containsCJK()
    -- Returns: "1" if CJK detected in first unnamed argument, else ""
    local story = frame.args[ 1 ] or ""
    if Text.containsCJK( story ) then
        return "1"
    end
    return ""

end

function p.isLatinRange( frame )

    -- Template interface for Text.isLatinRange()
    -- Returns: "1" if first unnamed argument is latin only, else ""
    local story = frame.args[ 1 ] or ""
    if Text.isLatinRange( story ) then
        return "1"
    end
    return ""

end

function p.listToText( frame )

    -- Template interface for Text.listToText()
    -- Parameter (frame.args):
    --     template  -- "1": take list items from the parent frame
    --     format    -- passed on as format
    local params  = frame.args
    local args    = params
    local foreign = params.template
    if type( foreign ) == "string" then
        foreign = ( mw.text.trim( foreign ) == "1" )
    end
    if foreign then
        args = frame:getParent().args
    end
    return Text.listToText( args, params.format )

end

function p.removeDiacritics( frame )

    -- Template interface for Text.removeDiacritics()
    -- A missing first unnamed argument is treated as empty string
    local story = frame.args[ 1 ]
    if not story then
        story = ""
    end
    return Text.removeDiacritics( story )

end

function p.sentenceTerminated( frame )

    -- Template interface for Text.sentenceTerminated()
    -- Returns: "1" if first unnamed argument is terminated, else ""
    local story = frame.args[ 1 ] or ""
    if Text.sentenceTerminated( story ) then
        return "1"
    end
    return ""

end

function p.ucfirstAll( frame )

    -- Template interface for Text.ucfirstAll()
    -- A missing first unnamed argument is treated as empty string
    local story = frame.args[ 1 ]
    if not story then
        story = ""
    end
    return Text.ucfirstAll( story )

end

function p.uprightNonlatin( frame )

    -- Template interface for Text.uprightNonlatin()
    -- A missing first unnamed argument is treated as empty string
    local story = frame.args[ 1 ]
    if not story then
        story = ""
    end
    return Text.uprightNonlatin( story )

end

function p.zip(frame)
    -- Combine several delimited lists item by item
    -- Parameter (frame.args):
    --     1, 2, ...       -- strings; each one a delimited list
    --     sep             -- default separator used to split the lists
    --     sep1, sep2, ... -- separator used to split list number N
    --     isep            -- inserted between the items of one position
    --     osep            -- inserted between consecutive positions
    -- Returns: string; for each position the items of all lists, where
    --          a list too short contributes an empty string
    local lists = {}
    local seps = {}
    local defaultsep = frame.args["sep"] or ""
    local innersep = frame.args["isep"] or ""
    local outersep = frame.args["osep"] or ""

    -- parse parameters
    for k, v in pairs(frame.args) do
        local knum = tonumber(k)
        if knum then
            lists[knum] = v
        else
            if string.sub(k, 1, 3) == "sep" then
                local sepnum = tonumber(string.sub(k, 4))
                if sepnum then
                    seps[sepnum] = v
                end
            end
        end
    end
    -- use the default separator wherever no explicit one was given
    for i = 1, math.max(#seps, #lists) do
        if not seps[i] then
            seps[i] = defaultsep
        end
    end

    -- split lists
    local maxListLen = 0
    for i = 1, #lists do
        lists[i] = mw.text.split(lists[i], seps[i])
        if #lists[i] > maxListLen then
            maxListLen = #lists[i]
        end
    end

    -- collect pieces in a table; joined once at the end
    local result = {}
    for i = 1, maxListLen do
        if i ~= 1 then
            result[#result + 1] = outersep
        end
        for j = 1, #lists do
            if j ~= 1 then
                result[#result + 1] = innersep
            end
            result[#result + 1] = lists[j][i] or ""
        end
    end
    return table.concat(result)
end

function p.Text()

    -- Library access for other modules
    -- Returns: table with all Text functions
    local library = Text
    return library

end -- p.Text

return p

Modul:Text: Unterschied zwischen den Versionen

Version vom 20. März 2015, 12:11 Uhr

Navigationsmenü

Suche

@@ Zeile 1: / Zeile 1: @@
---[=[ 2015-01-15
+--[=[ 2015-03-17
 Text utilities
 ]=]
@@ Zeile 6: / Zeile 6: @@
 local Text = { }
-local patternCJK        = false
+local PatternCJK        = false
-local patternLatin      = false
+local PatternCombined   = false
-local patternTerminated = false
+local PatternLatin      = false
+local PatternTerminated = false
+local RangesLatin       = false
@@ Zeile 55: / Zeile 57: @@
      return r
 end -- Text.containsCJK()
+Text.isLatinRange = function ( adjust )
+    -- Are characters expected to be latin or symbols within latin texts?
+    -- Precondition:
+    --     adjust  -- string, or nil for initialization
+    -- Returns: true, if valid for latin only
+    local r
+    if not RangesLatin then
+        RangesLatin = { {    7,  687 },
+                        { 7531, 7578 },
+                        { 7680, 7935 },
+                        { 8194, 8250 } }
+    end
+    if not PatternLatin then
+        local range
+        PatternLatin = "^["
+        for i = 1, #RangesLatin do
+            range = RangesLatin[ i ]
+            PatternLatin = PatternLatin ..
+                           mw.ustring.char( range[ 1 ], 45, range[ 2 ] )
+        end    -- for i
+        PatternLatin = PatternLatin .. "]*$"
+mw.log(PatternLatin)
+    end
+    if adjust then
+        if mw.ustring.match( adjust, PatternLatin ) then
+            r = true
+        else
+            r = false
+        end
+    end
+    return r
+end -- Text.isLatinRange()
@@ Zeile 78: / Zeile 115: @@
      return mw.text.listToText( collect )
 end -- Text.listToText()
+Text.removeDiacritics = function ( adjust )
+    -- Remove all diacritics
+    -- Parameter:
+    --     adjust  -- string
+    -- Returns: string; all latin letters should be ASCII
+    --                  or basic greek or cyrillic or symbols etc.
+    local cleanup, decomposed
+    if not PatternCombined then
+        PatternCombined = mw.ustring.char( 91,
+x0300, 45, 0x036F,
+x1AB0, 45, 0x1AFF,
+x1DC0, 45, 0x1DFF,
+xFE20, 45, 0xFE2F,
+)
+    end
+    decomposed = mw.ustring.toNFD( adjust )
+    cleanup    = mw.ustring.gsub( decomposed, PatternCombined, "" )
+    return mw.ustring.toNFC( cleanup )
+end -- Text.removeDiacritics()
@@ Zeile 88: / Zeile 147: @@
      -- Returns: true, if sentence terminated
      local r
-     if not patternTerminated then
+     if not PatternTerminated then
-         patternTerminated = mw.ustring.char( 91,
+         PatternTerminated = mw.ustring.char( 91,
 ,
 ,
@@ Zeile 96: / Zeile 155: @@
                              .. "!%.%?…][\"'%]‹›«»‘’“”]*$"
      end
-     if mw.ustring.find( analyse, patternTerminated ) then
+     if mw.ustring.find( analyse, PatternTerminated ) then
          r = true
      else
@@ Zeile 163: / Zeile 222: @@
      -- Returns: string with non-latin parts enclosed in <span>
      local r
-     if not patternLatin then
+     Text.isLatinRange()
-        patternLatin = mw.ustring.char(   94, 91,
+     if mw.ustring.match( adjust, PatternLatin ) then
-, 45,  591,
-, 45, 8250,
-, 42, 36 )
-    end
-     if mw.ustring.match( adjust, patternLatin ) then
          -- latin only, horizontal dashes, quotes
          r = adjust
@@ Zeile 178: / Zeile 232: @@
          local m    = false
          local n    = mw.ustring.len( adjust )
-         local span = "%s%s<span style='font-style:normal'>%s</span>"
+         local span = "%s%s<span dir='auto' style='font-style:normal'>%s</span>"
          local flat = function ( a )
-                -- isLatin
+                  -- isLatin
-                return  a <= 591   or   ( a >= 8194  and  a <= 8250 )
+                  local range
+                  for i = 1, #RangesLatin do
+                      range = RangesLatin[ i ]
+                      if a >= range[ 1 ]  and  a <= range[ 2 ] then
+                          return true
+                      end
+                  end    -- for i
                end -- flat()
          local form = function ( a )
@@ Zeile 231: / Zeile 291: @@
                  m = m + 1
              end
-         end -- for i
+         end    -- for i
          if j  and  ( not m  or  m < n ) then
              r = form( n )
@@ Zeile 265: / Zeile 325: @@
 function p.containsCJK( frame )
      return Text.containsCJK( frame.args[ 1 ] or "" ) and "1" or ""
+end
+function p.isLatinRange( frame )
+    return Text.isLatinRange( frame.args[1] or "" ) and "1" or ""
 end
@@ Zeile 280: / Zeile 344: @@
      end
      return Text.listToText( args, frame.args.format )
+end
+function p.removeDiacritics( frame )
+    return Text.removeDiacritics( frame.args[1] or "" )
 end
@@ Zeile 300: / Zeile 368: @@
 	local innersep = frame.args["isep"] or ""
 	local outersep = frame.args["osep"] or ""
 	-- Parameter parsen
 	for k, v in pairs(frame.args) do
@@ Zeile 332: / Zeile 400: @@
 	end
 	return result
-end
--- removes all diacritics from the input string, by decomposing the characters, removing the combining diacritical marks and recomposing the remaining characters
-function p.removeDiacritics(frame)
-	local combiningDiacriticalMarks = "[" ..
-		mw.ustring.char(0x0300) .. "-" .. mw.ustring.char(0x036F) ..
-		mw.ustring.char(0x1AB0) .. "-" .. mw.ustring.char(0x1AFF) ..
-		mw.ustring.char(0x1DC0) .. "-" .. mw.ustring.char(0x1DFF) ..
-		mw.ustring.char(0xFE20) .. "-" .. mw.ustring.char(0xFE2F) ..
-		"]"
-	return mw.ustring.toNFC(mw.ustring.gsub(mw.ustring.toNFD(frame.args[1] or ""), combiningDiacriticalMarks, ""))
 end