My ADALINE model using Gradient Descent is increasing error on each iteration












4












$begingroup$


I have used the Iris Dataset's 1st and 3rd Column for the features. and the labels of Iris Setosa (-1) and Iris Versicolor (1). I am using ADALINE as a simple classification model for my dataset. I am using gradient descent as the cost minimizing function. But on every iteration the error increases. What am I doing wrong in the python code?



import numpy as np
import pandas as pd

class AdalineGD(object):

def __init__(self, eta = 0.01, n_iter = 50):
self.eta = eta
self.n_iter = n_iter

def fit (self, X, y):
"""Fit training data."""

self.w_ = np.random.random(X.shape[1])
self.cost_ =
print ('Initial weights are: %r' %self.w_)
for i in range(self.n_iter):
output = self.net_input(X)
print ("On iteration %d, output is: %r" %(i, output))
errors = output - y
print("On iteration %d, Error is: %r" %(i, errors))
self.w_ += self.eta * X.T.dot(errors)
print ('Weights on iteration %d: %r' %(i, self.w_))
cost = (errors**2).sum() / 2.0
self.cost_.append(cost)
print ("On iteration %d, Cost is: %r" %(i, cost))
prediction = self.predict(X)
print ("Prediction after iteration %d is: %r" %(i, prediction))
input()
return self

def net_input(self, X):
"""Calculate net input"""
return X.dot(self.w_)

def activation(self, X):
"""Computer Linear Activation"""
return self.net_input(X)

def predict(self, X):
"""Return class label after unit step"""
return np.where(self.activation(X) >= 0.0, 1, -1)

####### END OF THE CLASS ########
#importing the Iris Dataset
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header = None)
y = df.iloc[0:100, 4].values
y = np.where(y == 'Iris-setosa', -1, 1)
X = df.iloc[0:100, [0, 2]].values
#Adding the ones column to the X matrix
X = np.insert(X, 0, np.ones(X.shape[0]), axis = 1)
ada = AdalineGD(n_iter = 20, eta = 0.001).fit(X, y)









share|improve this question









$endgroup$

















    4












    $begingroup$


    I have used the Iris Dataset's 1st and 3rd Column for the features. and the labels of Iris Setosa (-1) and Iris Versicolor (1). I am using ADALINE as a simple classification model for my dataset. I am using gradient descent as the cost minimizing function. But on every iteration the error increases. What am I doing wrong in the python code?



    import numpy as np
    import pandas as pd

    class AdalineGD(object):

    def __init__(self, eta = 0.01, n_iter = 50):
    self.eta = eta
    self.n_iter = n_iter

    def fit (self, X, y):
    """Fit training data."""

    self.w_ = np.random.random(X.shape[1])
    self.cost_ =
    print ('Initial weights are: %r' %self.w_)
    for i in range(self.n_iter):
    output = self.net_input(X)
    print ("On iteration %d, output is: %r" %(i, output))
    errors = output - y
    print("On iteration %d, Error is: %r" %(i, errors))
    self.w_ += self.eta * X.T.dot(errors)
    print ('Weights on iteration %d: %r' %(i, self.w_))
    cost = (errors**2).sum() / 2.0
    self.cost_.append(cost)
    print ("On iteration %d, Cost is: %r" %(i, cost))
    prediction = self.predict(X)
    print ("Prediction after iteration %d is: %r" %(i, prediction))
    input()
    return self

    def net_input(self, X):
    """Calculate net input"""
    return X.dot(self.w_)

    def activation(self, X):
    """Computer Linear Activation"""
    return self.net_input(X)

    def predict(self, X):
    """Return class label after unit step"""
    return np.where(self.activation(X) >= 0.0, 1, -1)

    ####### END OF THE CLASS ########
    #importing the Iris Dataset
    df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header = None)
    y = df.iloc[0:100, 4].values
    y = np.where(y == 'Iris-setosa', -1, 1)
    X = df.iloc[0:100, [0, 2]].values
    #Adding the ones column to the X matrix
    X = np.insert(X, 0, np.ones(X.shape[0]), axis = 1)
    ada = AdalineGD(n_iter = 20, eta = 0.001).fit(X, y)









    share|improve this question









    $endgroup$















      4












      4








      4


      1



      $begingroup$


      I have used the Iris Dataset's 1st and 3rd Column for the features. and the labels of Iris Setosa (-1) and Iris Versicolor (1). I am using ADALINE as a simple classification model for my dataset. I am using gradient descent as the cost minimizing function. But on every iteration the error increases. What am I doing wrong in the python code?



      import numpy as np
      import pandas as pd

      class AdalineGD(object):

      def __init__(self, eta = 0.01, n_iter = 50):
      self.eta = eta
      self.n_iter = n_iter

      def fit (self, X, y):
      """Fit training data."""

      self.w_ = np.random.random(X.shape[1])
      self.cost_ =
      print ('Initial weights are: %r' %self.w_)
      for i in range(self.n_iter):
      output = self.net_input(X)
      print ("On iteration %d, output is: %r" %(i, output))
      errors = output - y
      print("On iteration %d, Error is: %r" %(i, errors))
      self.w_ += self.eta * X.T.dot(errors)
      print ('Weights on iteration %d: %r' %(i, self.w_))
      cost = (errors**2).sum() / 2.0
      self.cost_.append(cost)
      print ("On iteration %d, Cost is: %r" %(i, cost))
      prediction = self.predict(X)
      print ("Prediction after iteration %d is: %r" %(i, prediction))
      input()
      return self

      def net_input(self, X):
      """Calculate net input"""
      return X.dot(self.w_)

      def activation(self, X):
      """Computer Linear Activation"""
      return self.net_input(X)

      def predict(self, X):
      """Return class label after unit step"""
      return np.where(self.activation(X) >= 0.0, 1, -1)

      ####### END OF THE CLASS ########
      #importing the Iris Dataset
      df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header = None)
      y = df.iloc[0:100, 4].values
      y = np.where(y == 'Iris-setosa', -1, 1)
      X = df.iloc[0:100, [0, 2]].values
      #Adding the ones column to the X matrix
      X = np.insert(X, 0, np.ones(X.shape[0]), axis = 1)
      ada = AdalineGD(n_iter = 20, eta = 0.001).fit(X, y)









      share|improve this question









      $endgroup$




      I have used the Iris Dataset's 1st and 3rd Column for the features. and the labels of Iris Setosa (-1) and Iris Versicolor (1). I am using ADALINE as a simple classification model for my dataset. I am using gradient descent as the cost minimizing function. But on every iteration the error increases. What am I doing wrong in the python code?



      import numpy as np
      import pandas as pd

      class AdalineGD(object):

      def __init__(self, eta = 0.01, n_iter = 50):
      self.eta = eta
      self.n_iter = n_iter

      def fit (self, X, y):
      """Fit training data."""

      self.w_ = np.random.random(X.shape[1])
      self.cost_ =
      print ('Initial weights are: %r' %self.w_)
      for i in range(self.n_iter):
      output = self.net_input(X)
      print ("On iteration %d, output is: %r" %(i, output))
      errors = output - y
      print("On iteration %d, Error is: %r" %(i, errors))
      self.w_ += self.eta * X.T.dot(errors)
      print ('Weights on iteration %d: %r' %(i, self.w_))
      cost = (errors**2).sum() / 2.0
      self.cost_.append(cost)
      print ("On iteration %d, Cost is: %r" %(i, cost))
      prediction = self.predict(X)
      print ("Prediction after iteration %d is: %r" %(i, prediction))
      input()
      return self

      def net_input(self, X):
      """Calculate net input"""
      return X.dot(self.w_)

      def activation(self, X):
      """Computer Linear Activation"""
      return self.net_input(X)

      def predict(self, X):
      """Return class label after unit step"""
      return np.where(self.activation(X) >= 0.0, 1, -1)

      ####### END OF THE CLASS ########
      #importing the Iris Dataset
      df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header = None)
      y = df.iloc[0:100, 4].values
      y = np.where(y == 'Iris-setosa', -1, 1)
      X = df.iloc[0:100, [0, 2]].values
      #Adding the ones column to the X matrix
      X = np.insert(X, 0, np.ones(X.shape[0]), axis = 1)
      ada = AdalineGD(n_iter = 20, eta = 0.001).fit(X, y)






      machine-learning python classification gradient-descent






      share|improve this question













      share|improve this question











      share|improve this question




      share|improve this question










      asked Jan 15 '17 at 8:02









      Himanshu AhujaHimanshu Ahuja

      287




      287






















          2 Answers
          2






          active

          oldest

          votes


















          1












          $begingroup$

          I think something is wrong here.



          self.w_ += self.eta * X.T.dot(errors)


          You are going to the positive to the gradient while you should be doing is going to the negative direction of it



          self.w_ -= self.eta * X.T.dot(errors)


          or



          self.w_ += -self.eta * X.T.dot(errors)


          see this for more clarification.






          share|improve this answer









          $endgroup$













          • $begingroup$
            After making the direction negative on the gradient descent, it only started working when I decreased the learning rate to 0.0001 from 0.001. On 0.001 it kept on switching predictions from 1 to -1 on each iteration.
            $endgroup$
            – Himanshu Ahuja
            Jan 15 '17 at 16:24






          • 2




            $begingroup$
            Indeed, you have to choose carefully your learning rate. If it is too big, your algorithm diverge. There are different ways to find a learning rate adapted to your situation, maybe this paper (part 5.1) will help you: cs.cmu.edu/~ggordon/10725-F12/scribes/10725_Lecture5.pdf
            $endgroup$
            – Pierre
            Jan 15 '17 at 17:59










          • $begingroup$
            Adding to @Pierre 's comment, take an sample function say $x^2+4$ and start with a guess say $5$ and keep changing the learning rates from $1$ to $0.1$ to $0.01$. You can the values of future $x$ being just jumping around the minimum in one case and lowering the learning rate stops this. But a learning rate above than this can sometimes do the same job of convergence more quicker as in the case of $0.1$ to $0.01$.
            $endgroup$
            – Kiritee Gak
            Jan 15 '17 at 18:38



















          0












          $begingroup$

          If you want to do



          self.w_ += self.eta * X.T.dot(errors)


          like i like to do.



          you just have to change



          errors = output - y


          to



          errors = y - output


          Hope this helps : )






          share|improve this answer










          New contributor




          Arjun Kathuria is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
          Check out our Code of Conduct.






          $endgroup$













            Your Answer





            StackExchange.ifUsing("editor", function () {
            return StackExchange.using("mathjaxEditing", function () {
            StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
            StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["$", "$"], ["\\(","\\)"]]);
            });
            });
            }, "mathjax-editing");

            StackExchange.ready(function() {
            var channelOptions = {
            tags: "".split(" "),
            id: "557"
            };
            initTagRenderer("".split(" "), "".split(" "), channelOptions);

            StackExchange.using("externalEditor", function() {
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled) {
            StackExchange.using("snippets", function() {
            createEditor();
            });
            }
            else {
            createEditor();
            }
            });

            function createEditor() {
            StackExchange.prepareEditor({
            heartbeatType: 'answer',
            autoActivateHeartbeat: false,
            convertImagesToLinks: false,
            noModals: true,
            showLowRepImageUploadWarning: true,
            reputationToPostImages: null,
            bindNavPrevention: true,
            postfix: "",
            imageUploader: {
            brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
            contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
            allowUrls: true
            },
            onDemand: true,
            discardSelector: ".discard-answer"
            ,immediatelyShowMarkdownHelp:true
            });


            }
            });














            draft saved

            draft discarded


















            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fdatascience.stackexchange.com%2fquestions%2f16324%2fmy-adaline-model-using-gradient-descent-is-increasing-error-on-each-iteration%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown

























            2 Answers
            2






            active

            oldest

            votes








            2 Answers
            2






            active

            oldest

            votes









            active

            oldest

            votes






            active

            oldest

            votes









            1












            $begingroup$

            I think something is wrong here.



            self.w_ += self.eta * X.T.dot(errors)


            You are going to the positive to the gradient while you should be doing is going to the negative direction of it



            self.w_ -= self.eta * X.T.dot(errors)


            or



            self.w_ += -self.eta * X.T.dot(errors)


            see this for more clarification.






            share|improve this answer









            $endgroup$













            • $begingroup$
              After making the direction negative on the gradient descent, it only started working when I decreased the learning rate to 0.0001 from 0.001. On 0.001 it kept on switching predictions from 1 to -1 on each iteration.
              $endgroup$
              – Himanshu Ahuja
              Jan 15 '17 at 16:24






            • 2




              $begingroup$
              Indeed, you have to choose carefully your learning rate. If it is too big, your algorithm diverge. There are different ways to find a learning rate adapted to your situation, maybe this paper (part 5.1) will help you: cs.cmu.edu/~ggordon/10725-F12/scribes/10725_Lecture5.pdf
              $endgroup$
              – Pierre
              Jan 15 '17 at 17:59










            • $begingroup$
              Adding to @Pierre 's comment, take an sample function say $x^2+4$ and start with a guess say $5$ and keep changing the learning rates from $1$ to $0.1$ to $0.01$. You can the values of future $x$ being just jumping around the minimum in one case and lowering the learning rate stops this. But a learning rate above than this can sometimes do the same job of convergence more quicker as in the case of $0.1$ to $0.01$.
              $endgroup$
              – Kiritee Gak
              Jan 15 '17 at 18:38
















            1












            $begingroup$

            I think something is wrong here.



            self.w_ += self.eta * X.T.dot(errors)


            You are going to the positive to the gradient while you should be doing is going to the negative direction of it



            self.w_ -= self.eta * X.T.dot(errors)


            or



            self.w_ += -self.eta * X.T.dot(errors)


            see this for more clarification.






            share|improve this answer









            $endgroup$













            • $begingroup$
              After making the direction negative on the gradient descent, it only started working when I decreased the learning rate to 0.0001 from 0.001. On 0.001 it kept on switching predictions from 1 to -1 on each iteration.
              $endgroup$
              – Himanshu Ahuja
              Jan 15 '17 at 16:24






            • 2




              $begingroup$
              Indeed, you have to choose carefully your learning rate. If it is too big, your algorithm diverge. There are different ways to find a learning rate adapted to your situation, maybe this paper (part 5.1) will help you: cs.cmu.edu/~ggordon/10725-F12/scribes/10725_Lecture5.pdf
              $endgroup$
              – Pierre
              Jan 15 '17 at 17:59










            • $begingroup$
              Adding to @Pierre 's comment, take an sample function say $x^2+4$ and start with a guess say $5$ and keep changing the learning rates from $1$ to $0.1$ to $0.01$. You can the values of future $x$ being just jumping around the minimum in one case and lowering the learning rate stops this. But a learning rate above than this can sometimes do the same job of convergence more quicker as in the case of $0.1$ to $0.01$.
              $endgroup$
              – Kiritee Gak
              Jan 15 '17 at 18:38














            1












            1








            1





            $begingroup$

            I think something is wrong here.



            self.w_ += self.eta * X.T.dot(errors)


            You are going to the positive to the gradient while you should be doing is going to the negative direction of it



            self.w_ -= self.eta * X.T.dot(errors)


            or



            self.w_ += -self.eta * X.T.dot(errors)


            see this for more clarification.






            share|improve this answer









            $endgroup$



            I think something is wrong here.



            self.w_ += self.eta * X.T.dot(errors)


            You are going to the positive to the gradient while you should be doing is going to the negative direction of it



            self.w_ -= self.eta * X.T.dot(errors)


            or



            self.w_ += -self.eta * X.T.dot(errors)


            see this for more clarification.







            share|improve this answer












            share|improve this answer



            share|improve this answer










            answered Jan 15 '17 at 15:07









            Kiritee GakKiritee Gak

            1,2111420




            1,2111420












            • $begingroup$
              After making the direction negative on the gradient descent, it only started working when I decreased the learning rate to 0.0001 from 0.001. On 0.001 it kept on switching predictions from 1 to -1 on each iteration.
              $endgroup$
              – Himanshu Ahuja
              Jan 15 '17 at 16:24






            • 2




              $begingroup$
              Indeed, you have to choose carefully your learning rate. If it is too big, your algorithm diverge. There are different ways to find a learning rate adapted to your situation, maybe this paper (part 5.1) will help you: cs.cmu.edu/~ggordon/10725-F12/scribes/10725_Lecture5.pdf
              $endgroup$
              – Pierre
              Jan 15 '17 at 17:59










            • $begingroup$
              Adding to @Pierre 's comment, take an sample function say $x^2+4$ and start with a guess say $5$ and keep changing the learning rates from $1$ to $0.1$ to $0.01$. You can the values of future $x$ being just jumping around the minimum in one case and lowering the learning rate stops this. But a learning rate above than this can sometimes do the same job of convergence more quicker as in the case of $0.1$ to $0.01$.
              $endgroup$
              – Kiritee Gak
              Jan 15 '17 at 18:38


















            • $begingroup$
              After making the direction negative on the gradient descent, it only started working when I decreased the learning rate to 0.0001 from 0.001. On 0.001 it kept on switching predictions from 1 to -1 on each iteration.
              $endgroup$
              – Himanshu Ahuja
              Jan 15 '17 at 16:24






            • 2




              $begingroup$
              Indeed, you have to choose carefully your learning rate. If it is too big, your algorithm diverge. There are different ways to find a learning rate adapted to your situation, maybe this paper (part 5.1) will help you: cs.cmu.edu/~ggordon/10725-F12/scribes/10725_Lecture5.pdf
              $endgroup$
              – Pierre
              Jan 15 '17 at 17:59










            • $begingroup$
              Adding to @Pierre 's comment, take an sample function say $x^2+4$ and start with a guess say $5$ and keep changing the learning rates from $1$ to $0.1$ to $0.01$. You can the values of future $x$ being just jumping around the minimum in one case and lowering the learning rate stops this. But a learning rate above than this can sometimes do the same job of convergence more quicker as in the case of $0.1$ to $0.01$.
              $endgroup$
              – Kiritee Gak
              Jan 15 '17 at 18:38
















            $begingroup$
            After making the direction negative on the gradient descent, it only started working when I decreased the learning rate to 0.0001 from 0.001. On 0.001 it kept on switching predictions from 1 to -1 on each iteration.
            $endgroup$
            – Himanshu Ahuja
            Jan 15 '17 at 16:24




            $begingroup$
            After making the direction negative on the gradient descent, it only started working when I decreased the learning rate to 0.0001 from 0.001. On 0.001 it kept on switching predictions from 1 to -1 on each iteration.
            $endgroup$
            – Himanshu Ahuja
            Jan 15 '17 at 16:24




            2




            2




            $begingroup$
            Indeed, you have to choose carefully your learning rate. If it is too big, your algorithm diverge. There are different ways to find a learning rate adapted to your situation, maybe this paper (part 5.1) will help you: cs.cmu.edu/~ggordon/10725-F12/scribes/10725_Lecture5.pdf
            $endgroup$
            – Pierre
            Jan 15 '17 at 17:59




            $begingroup$
            Indeed, you have to choose carefully your learning rate. If it is too big, your algorithm diverge. There are different ways to find a learning rate adapted to your situation, maybe this paper (part 5.1) will help you: cs.cmu.edu/~ggordon/10725-F12/scribes/10725_Lecture5.pdf
            $endgroup$
            – Pierre
            Jan 15 '17 at 17:59












            $begingroup$
            Adding to @Pierre 's comment, take an sample function say $x^2+4$ and start with a guess say $5$ and keep changing the learning rates from $1$ to $0.1$ to $0.01$. You can the values of future $x$ being just jumping around the minimum in one case and lowering the learning rate stops this. But a learning rate above than this can sometimes do the same job of convergence more quicker as in the case of $0.1$ to $0.01$.
            $endgroup$
            – Kiritee Gak
            Jan 15 '17 at 18:38




            $begingroup$
            Adding to @Pierre 's comment, take an sample function say $x^2+4$ and start with a guess say $5$ and keep changing the learning rates from $1$ to $0.1$ to $0.01$. You can the values of future $x$ being just jumping around the minimum in one case and lowering the learning rate stops this. But a learning rate above than this can sometimes do the same job of convergence more quicker as in the case of $0.1$ to $0.01$.
            $endgroup$
            – Kiritee Gak
            Jan 15 '17 at 18:38











            0












            $begingroup$

            If you want to do



            self.w_ += self.eta * X.T.dot(errors)


            like i like to do.



            you just have to change



            errors = output - y


            to



            errors = y - output


            Hope this helps : )






            share|improve this answer










            New contributor




            Arjun Kathuria is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
            Check out our Code of Conduct.






            $endgroup$


















              0












              $begingroup$

              If you want to do



              self.w_ += self.eta * X.T.dot(errors)


              like i like to do.



              you just have to change



              errors = output - y


              to



              errors = y - output


              Hope this helps : )






              share|improve this answer










              New contributor




              Arjun Kathuria is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
              Check out our Code of Conduct.






              $endgroup$
















                0












                0








                0





                $begingroup$

                If you want to do



                self.w_ += self.eta * X.T.dot(errors)


                like i like to do.



                you just have to change



                errors = output - y


                to



                errors = y - output


                Hope this helps : )






                share|improve this answer










                New contributor




                Arjun Kathuria is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
                Check out our Code of Conduct.






                $endgroup$



                If you want to do



                self.w_ += self.eta * X.T.dot(errors)


                like i like to do.



                you just have to change



                errors = output - y


                to



                errors = y - output


                Hope this helps : )







                share|improve this answer










                New contributor




                Arjun Kathuria is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
                Check out our Code of Conduct.









                share|improve this answer



                share|improve this answer








                edited 19 hours ago









                Siong Thye Goh

                1,122418




                1,122418






                New contributor




                Arjun Kathuria is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
                Check out our Code of Conduct.









                answered 20 hours ago









                Arjun KathuriaArjun Kathuria

                1




                1




                New contributor




                Arjun Kathuria is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
                Check out our Code of Conduct.





                New contributor





                Arjun Kathuria is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
                Check out our Code of Conduct.






                Arjun Kathuria is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
                Check out our Code of Conduct.






























                    draft saved

                    draft discarded




















































                    Thanks for contributing an answer to Data Science Stack Exchange!


                    • Please be sure to answer the question. Provide details and share your research!

                    But avoid



                    • Asking for help, clarification, or responding to other answers.

                    • Making statements based on opinion; back them up with references or personal experience.


                    Use MathJax to format equations. MathJax reference.


                    To learn more, see our tips on writing great answers.




                    draft saved


                    draft discarded














                    StackExchange.ready(
                    function () {
                    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fdatascience.stackexchange.com%2fquestions%2f16324%2fmy-adaline-model-using-gradient-descent-is-increasing-error-on-each-iteration%23new-answer', 'question_page');
                    }
                    );

                    Post as a guest















                    Required, but never shown





















































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown

































                    Required, but never shown














                    Required, but never shown












                    Required, but never shown







                    Required, but never shown







                    Popular posts from this blog

                    Callistus I

                    Tabula Rosettana

                    How to label and detect the document text images